Diffstat (limited to 'fs')
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/binfmt_elf_fdpic.c | 5
-rw-r--r--  fs/btrfs/Kconfig | 2
-rw-r--r--  fs/btrfs/block-group.c | 12
-rw-r--r--  fs/btrfs/delayed-inode.c | 104
-rw-r--r--  fs/btrfs/delayed-ref.c | 46
-rw-r--r--  fs/btrfs/delayed-ref.h | 1
-rw-r--r--  fs/btrfs/disk-io.c | 22
-rw-r--r--  fs/btrfs/extent-tree.c | 18
-rw-r--r--  fs/btrfs/extent_io.c | 17
-rw-r--r--  fs/btrfs/file.c | 40
-rw-r--r--  fs/btrfs/inode.c | 33
-rw-r--r--  fs/btrfs/ioctl.c | 8
-rw-r--r--  fs/btrfs/locking.h | 2
-rw-r--r--  fs/btrfs/ordered-data.c | 2
-rw-r--r--  fs/btrfs/super.c | 7
-rw-r--r--  fs/btrfs/transaction.c | 45
-rw-r--r--  fs/btrfs/transaction.h | 1
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/btrfs/verity.c | 64
-rw-r--r--  fs/btrfs/volumes.c | 13
-rw-r--r--  fs/buffer.c | 36
-rw-r--r--  fs/ceph/crypto.c | 6
-rw-r--r--  fs/efivarfs/super.c | 14
-rw-r--r--  fs/ext4/mballoc.c | 54
-rw-r--r--  fs/ext4/namei.c | 26
-rw-r--r--  fs/ext4/super.c | 2
-rw-r--r--  fs/fs-writeback.c | 11
-rw-r--r--  fs/gfs2/glock.c | 4
-rw-r--r--  fs/gfs2/glops.c | 9
-rw-r--r--  fs/gfs2/quota.h | 3
-rw-r--r--  fs/inode.c | 82
-rw-r--r--  fs/iomap/buffered-io.c | 32
-rw-r--r--  fs/jbd2/commit.c | 16
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 12
-rw-r--r--  fs/libfs.c | 1
-rw-r--r--  fs/netfs/buffered_read.c | 6
-rw-r--r--  fs/nfs/direct.c | 134
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 1
-rw-r--r--  fs/nfs/nfs4client.c | 6
-rw-r--r--  fs/nfs/nfs4proc.c | 10
-rw-r--r--  fs/nfs/nfs4state.c | 45
-rw-r--r--  fs/nfs/write.c | 25
-rw-r--r--  fs/nfsd/nfs4proc.c | 4
-rw-r--r--  fs/nfsd/nfs4xdr.c | 4
-rw-r--r--  fs/nfsd/nfssvc.c | 5
-rw-r--r--  fs/nilfs2/gcinode.c | 6
-rw-r--r--  fs/ntfs3/super.c | 1
-rw-r--r--  fs/overlayfs/copy_up.c | 5
-rw-r--r--  fs/overlayfs/export.c | 2
-rw-r--r--  fs/overlayfs/file.c | 17
-rw-r--r--  fs/overlayfs/ovl_entry.h | 10
-rw-r--r--  fs/overlayfs/params.c | 17
-rw-r--r--  fs/overlayfs/super.c | 27
-rw-r--r--  fs/pipe.c | 1
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/task_nommu.c | 64
-rw-r--r--  fs/reiserfs/reiserfs.h | 6
-rw-r--r--  fs/smb/client/cached_dir.c | 6
-rw-r--r--  fs/smb/client/cifsglob.h | 3
-rw-r--r--  fs/smb/client/cifsproto.h | 2
-rw-r--r--  fs/smb/client/connect.c | 8
-rw-r--r--  fs/smb/client/fs_context.c | 1
-rw-r--r--  fs/smb/client/inode.c | 2
-rw-r--r--  fs/smb/client/misc.c | 14
-rw-r--r--  fs/smb/client/smb2inode.c | 3
-rw-r--r--  fs/smb/client/smb2maperror.c | 2
-rw-r--r--  fs/smb/client/smb2ops.c | 6
-rw-r--r--  fs/smb/client/smb2pdu.c | 35
-rw-r--r--  fs/smb/client/smbdirect.c | 9
-rw-r--r--  fs/smb/client/trace.h | 2
-rw-r--r--  fs/smb/client/transport.c | 36
-rw-r--r--  fs/smb/server/connection.c | 3
-rw-r--r--  fs/smb/server/server.c | 4
-rw-r--r--  fs/smb/server/smb2misc.c | 4
-rw-r--r--  fs/smb/server/smb2pdu.c | 2
-rw-r--r--  fs/smb/server/smbacl.c | 1
-rw-r--r--  fs/stat.c | 47
-rw-r--r--  fs/tracefs/event_inode.c | 146
-rw-r--r--  fs/tracefs/inode.c | 5
-rw-r--r--  fs/tracefs/internal.h | 5
-rw-r--r--  fs/xfs/Kconfig | 2
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 22
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c | 6
-rw-r--r--  fs/xfs/scrub/scrub.c | 4
-rw-r--r--  fs/xfs/scrub/stats.c | 5
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 1
-rw-r--r--  fs/xfs/xfs_attr_item.c | 7
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 4
-rw-r--r--  fs/xfs/xfs_export.c | 14
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 4
-rw-r--r--  fs/xfs/xfs_fsmap.c | 25
-rw-r--r--  fs/xfs/xfs_icache.c | 80
-rw-r--r--  fs/xfs/xfs_icache.h | 1
-rw-r--r--  fs/xfs/xfs_inode.c | 231
-rw-r--r--  fs/xfs/xfs_inode.h | 34
-rw-r--r--  fs/xfs/xfs_iops.c | 6
-rw-r--r--  fs/xfs/xfs_itable.c | 11
-rw-r--r--  fs/xfs/xfs_log.c | 17
-rw-r--r--  fs/xfs/xfs_log_cil.c | 52
-rw-r--r--  fs/xfs/xfs_log_priv.h | 14
-rw-r--r--  fs/xfs/xfs_log_recover.c | 4
-rw-r--r--  fs/xfs/xfs_mount.h | 17
-rw-r--r--  fs/xfs/xfs_qm.c | 16
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 6
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 6
-rw-r--r--  fs/xfs/xfs_super.c | 88
-rw-r--r--  fs/xfs/xfs_trace.h | 45
-rw-r--r--  fs/xfs/xfs_xattr.c | 11
111 files changed, 1385 insertions, 851 deletions
diff --git a/fs/aio.c b/fs/aio.c
index a4c2a6bac72c..f8589caef9c1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,7 +80,7 @@ struct aio_ring {
struct kioctx_table {
struct rcu_head rcu;
unsigned nr;
- struct kioctx __rcu *table[];
+ struct kioctx __rcu *table[] __counted_by(nr);
};
struct kioctx_cpu {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 43b2a2851ba3..206812ce544a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
+ SET_PERSONALITY(exec_params.hdr);
if (elf_check_fdpic(&exec_params.hdr))
- set_personality(PER_LINUX_FDPIC);
- else
- set_personality(PER_LINUX);
+ current->personality |= PER_LINUX_FDPIC;
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 3282adc84d52..a25c9910d90b 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,7 +31,7 @@ config BTRFS_FS
continue to be mountable and usable by newer kernels.
For more information, please see the web pages at
- http://btrfs.wiki.kernel.org.
+ https://btrfs.readthedocs.io
To compile this file system support as a module, choose M here. The
module will be called btrfs.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0cb1dee965a0..b2e5107b7cec 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3028,8 +3028,16 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
- /* We didn't update the block group item, need to revert @commit_used. */
- if (ret < 0) {
+ /*
+ * We didn't update the block group item, need to revert commit_used
+ * unless the block group item didn't exist yet - this is to prevent a
+ * race with a concurrent insertion of the block group item, with
+ * insert_block_group_item(), that happened just after we attempted to
+ * update. In that case we would reset commit_used to 0 just after the
+ * insertion set it to a value greater than 0 - if the block group later
+ * becomes with 0 used bytes, we would incorrectly skip its update.
+ */
+ if (ret < 0 && ret != -ENOENT) {
spin_lock(&cache->lock);
cache->commit_used = old_commit_used;
spin_unlock(&cache->lock);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 53c1211dd60b..caf0bbd028d1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -412,6 +412,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
+ struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
@@ -419,18 +420,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
- delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+ /* If it's in a rbtree, then we need to have delayed node locked. */
+ lockdep_assert_held(&delayed_node->mutex);
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
- root = &delayed_item->delayed_node->ins_root;
+ root = &delayed_node->ins_root;
else
- root = &delayed_item->delayed_node->del_root;
+ root = &delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
RB_CLEAR_NODE(&delayed_item->rb_node);
- delayed_item->delayed_node->count--;
+ delayed_node->count--;
finish_one_item(delayed_root);
}
@@ -1153,20 +1157,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
- btrfs_release_delayed_node(curr_node);
- curr_node = NULL;
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
curr_node = btrfs_next_delayed_node(curr_node);
+ /*
+ * See the comment below about releasing path before releasing
+ * node. If the commit of delayed items was successful the path
+ * should always be released, but in case of an error, it may
+ * point to locked extent buffers (a leaf at the very least).
+ */
+ ASSERT(path->nodes[0] == NULL);
btrfs_release_delayed_node(prev_node);
}
+ /*
+ * Release the path to avoid a potential deadlock and lockdep splat when
+ * releasing the delayed node, as that requires taking the delayed node's
+ * mutex. If another task starts running delayed items before we take
+ * the mutex, it will first lock the mutex and then it may try to lock
+ * the same btree path (leaf).
+ */
+ btrfs_free_path(path);
+
if (curr_node)
btrfs_release_delayed_node(curr_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
@@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}
-/* Will return 0 or -ENOMEM */
+static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+}
+
+/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
@@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
+ /*
+ * First attempt to insert the delayed item. This is to make the error
+ * handling path simpler in case we fail (-EEXIST). There's no risk of
+ * any other task coming in and running the delayed item before we do
+ * the metadata space reservation below, because we are holding the
+ * delayed node's mutex and that mutex must also be locked before the
+ * node's delayed items can be run.
+ */
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ btrfs_err(trans->fs_info,
+"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
+ name_len, name, index, btrfs_root_id(delayed_node->root),
+ delayed_node->inode_id, dir->index_cnt,
+ delayed_node->index_cnt, ret);
+ btrfs_release_delayed_item(delayed_item);
+ btrfs_release_dir_index_item_space(trans);
+ mutex_unlock(&delayed_node->mutex);
+ goto release_node;
+ }
+
if (delayed_node->index_item_leaves == 0 ||
delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
delayed_node->curr_index_batch_size = data_len;
@@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
* impossible.
*/
if (WARN_ON(ret)) {
- mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_item(delayed_item);
+ mutex_unlock(&delayed_node->mutex);
goto release_node;
}
delayed_node->index_item_leaves++;
- } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
- const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
-
- /*
- * Adding the new dir index item does not require touching another
- * leaf, so we can release 1 unit of metadata that was previously
- * reserved when starting the transaction. This applies only to
- * the case where we had a transaction start and excludes the
- * transaction join case (when replaying log trees).
- */
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, bytes, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
- ASSERT(trans->bytes_reserved >= bytes);
- trans->bytes_reserved -= bytes;
- }
-
- ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
- if (unlikely(ret)) {
- btrfs_err(trans->fs_info,
- "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->root_key.objectid,
- delayed_node->inode_id, ret);
- BUG();
+ } else {
+ btrfs_release_dir_index_item_space(trans);
}
mutex_unlock(&delayed_node->mutex);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6a13cf00218b..9fe4ccca50a0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -103,24 +103,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
* Transfer bytes to our delayed refs rsv.
*
* @fs_info: the filesystem
- * @src: source block rsv to transfer from
* @num_bytes: number of bytes to transfer
*
- * This transfers up to the num_bytes amount from the src rsv to the
+ * This transfers up to the num_bytes amount, previously reserved, to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
*/
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
u64 num_bytes)
{
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
u64 to_free = 0;
- spin_lock(&src->lock);
- src->reserved -= num_bytes;
- src->size -= num_bytes;
- spin_unlock(&src->lock);
-
spin_lock(&delayed_refs_rsv->lock);
if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
u64 delta = delayed_refs_rsv->size -
@@ -163,6 +156,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
+ u64 refilled_bytes;
+ u64 to_free;
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
@@ -178,9 +173,38 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
- btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
+
+ /*
+ * We may have raced with someone else, so check again if we the block
+ * reserve is still not full and release any excess space.
+ */
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ u64 needed = block_rsv->size - block_rsv->reserved;
+
+ if (num_bytes >= needed) {
+ block_rsv->reserved += needed;
+ block_rsv->full = true;
+ to_free = num_bytes - needed;
+ refilled_bytes = needed;
+ } else {
+ block_rsv->reserved += num_bytes;
+ to_free = 0;
+ refilled_bytes = num_bytes;
+ }
+ } else {
+ to_free = num_bytes;
+ refilled_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (to_free > 0)
+ btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info,
+ to_free);
+
+ if (refilled_bytes > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
+ refilled_bytes, 1);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index b8e14b0ba5f1..fd9bf2b709c0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -407,7 +407,6 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0a96ea8c1d3a..68f60d50e1fd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -520,6 +520,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
struct folio *folio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
@@ -533,18 +534,19 @@ static bool btree_dirty_folio(struct address_space *mapping,
btrfs_assert_tree_write_locked(eb);
return filemap_dirty_folio(mapping, folio);
}
+
+ ASSERT(spi);
subpage = folio_get_private(folio);
- ASSERT(subpage->dirty_bitmap);
- while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ for (cur_bit = spi->dirty_offset;
+ cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
+ cur_bit++) {
unsigned long flags;
u64 cur;
- u16 tmp = (1 << cur_bit);
spin_lock_irqsave(&subpage->lock, flags);
- if (!(tmp & subpage->dirty_bitmap)) {
+ if (!test_bit(cur_bit, subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- cur_bit++;
continue;
}
spin_unlock_irqrestore(&subpage->lock, flags);
@@ -557,7 +559,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
- cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
}
return filemap_dirty_folio(mapping, folio);
}
@@ -1547,7 +1549,7 @@ static int transaction_kthread(void *arg)
delta = ktime_get_seconds() - cur->start_time;
if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
- cur->state < TRANS_STATE_COMMIT_START &&
+ cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
@@ -2682,8 +2684,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
- btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
- BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
BTRFS_LOCKDEP_TRANS_UNBLOCKED);
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
@@ -4870,7 +4872,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->trans_list)) {
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
- if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state >= TRANS_STATE_COMMIT_PREP) {
refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f356f08b55cb..fc313fce5bbd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1514,15 +1514,14 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID)
ret = insert_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
- } else {
+ else
ret = insert_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset,
refs_to_add);
- }
+
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1656,7 +1655,10 @@ again:
goto again;
}
} else {
- err = -EIO;
+ err = -EUCLEAN;
+ btrfs_err(fs_info,
+ "missing extent item for extent %llu num_bytes %llu level %d",
+ head->bytenr, head->num_bytes, extent_op->level);
goto out;
}
}
@@ -1699,12 +1701,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- if (node->ref_mod != 1) {
+ if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
- "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
- return -EIO;
+ return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ac3fca5a5e41..caccd0376342 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -484,10 +484,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
bvec->bv_offset, bvec->bv_len);
btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
- if (error) {
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ if (error)
mapping_set_error(page->mapping, error);
- }
btrfs_page_clear_writeback(fs_info, page, start, len);
}
@@ -1456,8 +1454,6 @@ done:
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
PAGE_SIZE, !ret);
- btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page,
- page_start, PAGE_SIZE);
mapping_set_error(page->mapping, ret);
}
unlock_page(page);
@@ -1624,8 +1620,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
struct page *page = bvec->bv_page;
u32 len = bvec->bv_len;
- if (!uptodate)
- btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_clear_writeback(fs_info, page, start, len);
bio_offset += len;
}
@@ -2201,7 +2195,6 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
cur, cur_len, !ret);
- btrfs_page_clear_uptodate(fs_info, page, cur, cur_len);
mapping_set_error(page->mapping, ret);
}
btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
@@ -4002,8 +3995,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = get_eb_page_index(start);
- if (check_eb_range(eb, start, len))
+ if (check_eb_range(eb, start, len)) {
+ /*
+ * Invalid range hit, reset the memory, so callers won't get
+ * some random garbage for their uninitialized memory.
+ */
+ memset(dstv, 0, len);
return;
+ }
offset = get_eb_offset_in_page(eb, start);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca46a529d56b..361535c71c0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1106,6 +1106,25 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now, ctime;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ if (!timespec64_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ ctime = inode_get_ctime(inode);
+ if (!timespec64_equal(&ctime, &now))
+ inode_set_ctime_to_ts(inode, now);
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
size_t count)
{
@@ -1137,10 +1156,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- if (!IS_NOCMTIME(inode)) {
- inode->i_mtime = inode_set_ctime_current(inode);
- inode_inc_iversion(inode);
- }
+ update_time_for_write(inode);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -1451,8 +1467,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
- /* If the write DIO is within EOF, use a shared lock */
- if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
+ /*
+ * If the write DIO is within EOF, use a shared lock and also only if
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Either will need to be rechecked after the
+ * lock was acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
@@ -1460,6 +1481,13 @@ relock:
if (err < 0)
return err;
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f09fbdc43f0f..7814b9d654ce 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1085,9 +1085,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
btrfs_mark_ordered_io_finished(inode, locked_page,
page_start, PAGE_SIZE,
!ret);
- btrfs_page_clear_uptodate(inode->root->fs_info,
- locked_page, page_start,
- PAGE_SIZE);
mapping_set_error(locked_page->mapping, ret);
unlock_page(locked_page);
}
@@ -2791,7 +2788,6 @@ out_page:
mapping_set_error(page->mapping, ret);
btrfs_mark_ordered_io_finished(inode, page, page_start,
PAGE_SIZE, !ret);
- btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE);
clear_page_dirty_for_io(page);
}
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
@@ -5769,20 +5765,24 @@ out:
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
{
- if (dir->index_cnt == (u64)-1) {
- int ret;
+ int ret = 0;
+ btrfs_inode_lock(dir, 0);
+ if (dir->index_cnt == (u64)-1) {
ret = btrfs_inode_delayed_dir_index_count(dir);
if (ret) {
ret = btrfs_set_inode_index_count(dir);
if (ret)
- return ret;
+ goto out;
}
}
- *index = dir->index_cnt;
+ /* index_cnt is the index number of next new entry, so decrement it. */
+ *index = dir->index_cnt - 1;
+out:
+ btrfs_inode_unlock(dir, 0);
- return 0;
+ return ret;
}
/*
@@ -5817,6 +5817,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file)
return 0;
}
+static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct btrfs_file_private *private = file->private_data;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
+ &private->last_index);
+ if (ret)
+ return ret;
+
+ return generic_file_llseek(file, offset, whence);
+}
+
struct dir_entry {
u64 ino;
u64 offset;
@@ -10868,7 +10881,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
};
static const struct file_operations btrfs_dir_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = btrfs_dir_llseek,
.read = generic_read_dir,
.iterate_shared = btrfs_real_readdir,
.open = btrfs_opendir,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a18ee7b5a166..75ab766fe156 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1958,6 +1958,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
+ /*
+ * We don't need the path anymore, so release it and
+ * avoid deadlocks and lockdep warnings in case
+ * btrfs_iget() needs to lookup the inode from its root
+ * btree and lock the same leaf.
+ */
+ btrfs_release_path(path);
temp_inode = btrfs_iget(sb, key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
@@ -1978,7 +1985,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
- btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index edb9b4a0dba1..7d6ee1e609bf 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -79,7 +79,7 @@ enum btrfs_lock_nesting {
};
enum btrfs_lockdep_trans_states {
- BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP,
BTRFS_LOCKDEP_TRANS_UNBLOCKED,
BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
BTRFS_LOCKDEP_TRANS_COMPLETED,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b46ab348e8e5..345c449d588c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -639,7 +639,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
- ASSERT(trans);
+ ASSERT(trans || BTRFS_FS_ERROR(fs_info));
if (trans) {
if (atomic_dec_and_test(&trans->pending_ordered))
wake_up(&trans->pending_wait);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09bfe68d2ea3..1a093ec0f7e3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2117,7 +2117,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
- total_free_meta - thresh < block_rsv->size)
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
@@ -2150,7 +2150,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
static struct file_system_type btrfs_root_fs_type = {
@@ -2158,8 +2158,7 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
- FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 874e4394df86..c780d3729463 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -56,12 +56,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | Call btrfs_commit_transaction() on any trans handle attached to
* | transaction N
* V
- * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * Transaction N [[TRANS_STATE_COMMIT_PREP]]
+ * |
+ * | If there are simultaneous calls to btrfs_commit_transaction() one will win
+ * | the race and the rest will wait for the winner to commit the transaction.
+ * |
+ * | The winner will wait for previous running transaction to completely finish
+ * | if there is one.
* |
- * | Will wait for previous running transaction to completely finish if there
- * | is one
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
* |
- * | Then one of the following happes:
+ * | Then one of the following happens:
* | - Wait for all other trans handle holders to release.
* | The btrfs_commit_transaction() caller will do the commit work.
* | - Wait for current transaction to be committed by others.
@@ -112,6 +117,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
*/
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_COMMIT_PREP] = 0U,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
@@ -625,14 +631,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
if (ret)
goto reserve_fail;
if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
- delayed_refs_bytes);
+ btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
num_bytes -= delayed_refs_bytes;
}
+ btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
if (rsv->space_info->force_alloc)
do_chunk_alloc = true;
@@ -1982,7 +1988,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
- btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
@@ -2129,7 +2135,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
return;
lockdep_assert_held(&trans->fs_info->trans_lock);
- ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
@@ -2153,7 +2159,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
- btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
@@ -2213,7 +2219,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
spin_lock(&fs_info->trans_lock);
- if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
add_pending_snapshot(trans);
@@ -2225,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
want_state = TRANS_STATE_SUPER_COMMITTED;
btrfs_trans_state_lockdep_release(fs_info,
- BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
@@ -2237,9 +2243,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- cur_trans->state = TRANS_STATE_COMMIT_START;
+ cur_trans->state = TRANS_STATE_COMMIT_PREP;
wake_up(&fs_info->transaction_blocked_wait);
- btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2260,11 +2266,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_put_transaction(prev_trans);
if (ret)
goto lockdep_release;
- } else {
- spin_unlock(&fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
}
} else {
- spin_unlock(&fs_info->trans_lock);
/*
* The previous transaction was aborted and was already removed
* from the list of transactions at fs_info->trans_list. So we
@@ -2272,11 +2276,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
if (BTRFS_FS_ERROR(fs_info)) {
+ spin_unlock(&fs_info->trans_lock);
ret = -EROFS;
goto lockdep_release;
}
}
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&fs_info->transaction_blocked_wait);
+ spin_unlock(&fs_info->trans_lock);
+
/*
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
@@ -2586,7 +2595,7 @@ lockdep_release:
goto cleanup_transaction;
lockdep_trans_commit_start_release:
- btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
btrfs_end_transaction(trans);
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 8e9fa23bd7fe..6b309f8a99a8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -14,6 +14,7 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
+ TRANS_STATE_COMMIT_PREP,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d1e46b839519..cbb17b542131 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4722,7 +4722,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index c5ff16f9e9fa..744f4f4d4c68 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- struct page *page;
+ struct folio *folio;
u64 off = (u64)index << PAGE_SHIFT;
loff_t merkle_pos = merkle_file_pos(inode);
int ret;
@@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
return ERR_PTR(-EFBIG);
index += merkle_pos >> PAGE_SHIFT;
again:
- page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
- if (page) {
- if (PageUptodate(page))
- return page;
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (!IS_ERR(folio)) {
+ if (folio_test_uptodate(folio))
+ goto out;
- lock_page(page);
- /*
- * We only insert uptodate pages, so !Uptodate has to be
- * an error
- */
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ /* If it's not uptodate after we have the lock, we got a read error. */
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
- unlock_page(page);
- return page;
+ folio_unlock(folio);
+ goto out;
}
- page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
- if (!page)
+ folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
+ 0);
+ if (!folio)
return ERR_PTR(-ENOMEM);
+ ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS);
+ if (ret) {
+ folio_put(folio);
+ /* Did someone else insert a folio here? */
+ if (ret == -EEXIST)
+ goto again;
+ return ERR_PTR(ret);
+ }
+
/*
* Merkle item keys are indexed from byte 0 in the merkle tree.
* They have the form:
@@ -756,28 +763,19 @@ again:
* [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
- page_address(page), PAGE_SIZE, page);
+ folio_address(folio), PAGE_SIZE, &folio->page);
if (ret < 0) {
- put_page(page);
+ folio_put(folio);
return ERR_PTR(ret);
}
if (ret < PAGE_SIZE)
- memzero_page(page, ret, PAGE_SIZE - ret);
+ folio_zero_segment(folio, ret, PAGE_SIZE);
- SetPageUptodate(page);
- ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
- if (!ret) {
- /* Inserted and ready for fsverity */
- unlock_page(page);
- } else {
- put_page(page);
- /* Did someone race us into inserting this page? */
- if (ret == -EEXIST)
- goto again;
- page = ERR_PTR(ret);
- }
- return page;
+out:
+ return folio_file_page(folio, index);
}
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9621455edebc..5a5a8d488a7b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1594,7 +1594,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 search_start;
u64 hole_size;
u64 max_hole_start;
- u64 max_hole_size;
+ u64 max_hole_size = 0;
u64 extent_end;
u64 search_end = device->total_bytes;
int ret;
@@ -1602,17 +1602,16 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
struct extent_buffer *l;
search_start = dev_extent_search_start(device);
+ max_hole_start = search_start;
WARN_ON(device->zone_info &&
!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- max_hole_start = search_start;
- max_hole_size = 0;
-
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
again:
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 2379564e5aea..12e9a71c693d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2011,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
}
EXPORT_SYMBOL(folio_zero_new_buffers);
-static void
+static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
@@ -2025,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
* current block, then do not map the buffer and let the caller
* handle it.
*/
- BUG_ON(offset >= iomap->offset + iomap->length);
+ if (offset >= iomap->offset + iomap->length)
+ return -EIO;
switch (iomap->type) {
case IOMAP_HOLE:
@@ -2037,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
- break;
+ return 0;
case IOMAP_DELALLOC:
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
@@ -2045,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
set_buffer_uptodate(bh);
set_buffer_mapped(bh);
set_buffer_delay(bh);
- break;
+ return 0;
case IOMAP_UNWRITTEN:
/*
* For unwritten regions, we always need to ensure that regions
@@ -2057,12 +2058,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
- offset >= i_size_read(inode))
+ offset >= i_size_read(inode)) {
+ /*
+ * This can happen if truncating the block device races
+ * with the check in the caller as i_size updates on
+ * block devices aren't synchronized by i_rwsem for
+ * block devices.
+ */
+ if (S_ISBLK(inode->i_mode))
+ return -EIO;
set_buffer_new(bh);
+ }
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh);
- break;
+ return 0;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
}
}
@@ -2103,13 +2116,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- if (get_block) {
+ if (get_block)
err = get_block(inode, block, bh, 1);
- if (err)
- break;
- } else {
- iomap_to_bh(inode, block, bh, iomap);
- }
+ else
+ err = iomap_to_bh(inode, block, bh, iomap);
+ if (err)
+ break;
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index e4d5cd56a80b..e1f31b86fd48 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -249,11 +249,9 @@ static struct inode *parse_longname(const struct inode *parent,
if (!dir) {
/* This can happen if we're not mounting cephfs on the root */
dir = ceph_get_inode(parent->i_sb, vino, NULL);
- if (!dir)
- dir = ERR_PTR(-ENOENT);
+ if (IS_ERR(dir))
+ dout("Can't find inode %s (%s)\n", inode_number, name);
}
- if (IS_ERR(dir))
- dout("Can't find inode %s (%s)\n", inode_number, name);
out:
kfree(inode_number);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index e028fafa04f3..996271473609 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 storage_space, remaining_space, max_variable_size;
efi_status_t status;
- status = efivar_query_variable_info(attr, &storage_space, &remaining_space,
- &max_variable_size);
- if (status != EFI_SUCCESS)
- return efi_status_to_err(status);
+ /* Some UEFI firmware does not implement QueryVariableInfo() */
+ storage_space = remaining_space = 0;
+ if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) {
+ status = efivar_query_variable_info(attr, &storage_space,
+ &remaining_space,
+ &max_variable_size);
+ if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED)
+ pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n",
+ status);
+ }
/*
* This is not a normal filesystem, so no point in pretending it has a block
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c91db9f57524..1e599305d85f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
/*
@@ -6906,6 +6907,21 @@ __acquires(bitlock)
return ret;
}
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ if (grp < ext4_get_groups_count(sb))
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp) - 1) >>
+ EXT4_CLUSTER_BITS(sb);
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
ext4_grpblk_t max, ext4_grpblk_t minblocks)
@@ -6913,9 +6929,12 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
+ bool set_trimmed = false;
void *bitmap;
bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ set_trimmed = true;
start = max(e4b->bd_info->bb_first_free, start);
count = 0;
free_count = 0;
@@ -6930,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
int ret = ext4_trim_extent(sb, start, next - start, e4b);
if (ret && ret != -EOPNOTSUPP)
- break;
+ return count;
count += next - start;
}
free_count += next - start;
start = next + 1;
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
+ if (ext4_trim_interrupted())
+ return count;
if (need_resched()) {
ext4_unlock_group(sb, e4b->bd_group);
@@ -6951,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
break;
}
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
return count;
}
@@ -6961,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
- * @set_trimmed: set the trimmed flag if at least one block is trimmed
*
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
@@ -6971,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks, bool set_trimmed)
+ ext4_grpblk_t minblocks)
{
struct ext4_buddy e4b;
int ret;
@@ -6988,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_lock_group(sb, group);
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
- minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
- if (ret >= 0 && set_trimmed)
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- } else {
+ else
ret = 0;
- }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -7027,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
- bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -7046,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks - 1) {
+ if (end >= max_blks - 1)
end = max_blks - 1;
- eof = true;
- }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -7063,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- whole_group = true;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
if (!grp)
continue;
@@ -7082,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group) {
+ if (group == last_group)
end = last_cluster;
- whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen, whole_group);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 41a6411c600b..bbda587f76b8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
d = (struct ext4_dir_entry *)bh->b_data;
top = (struct ext4_dir_entry *)(bh->b_data +
- (EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
- while (d < top && d->rec_len)
+ (blocksize - sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
d = (struct ext4_dir_entry *)(((void *)d) +
- le16_to_cpu(d->rec_len));
+ ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
return NULL;
@@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
#endif
if (t->det_reserved_zero1 ||
- le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
+ sizeof(struct ext4_dir_entry_tail)) ||
t->det_reserved_zero2 ||
t->det_reserved_ft != EXT4_FT_DIR_CSUM)
return NULL;
@@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
struct ext4_dir_entry *dp;
struct dx_root_info *root;
int count_offset;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ if (rlen == blocksize)
count_offset = 8;
- else if (le16_to_cpu(dirent->rec_len) == 12) {
+ else if (rlen == 12) {
dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (le16_to_cpu(dp->rec_len) !=
- EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
return NULL;
root = (struct dx_root_info *)(((void *)dp + 12));
if (root->reserved_zero ||
@@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
unsigned int buflen = bh->b_size;
char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
+ int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
if (ext4_has_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
@@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
- map_tail->size = le16_to_cpu(de->rec_len);
+ map_tail->size = ext4_rec_len_from_disk(de->rec_len,
+ blocksize);
count++;
cond_resched();
}
- de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 38217422f938..dbebd8b3127e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -7314,7 +7314,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 969ce991b0b0..c1af01b2c42d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
if (wbc->pages_skipped) {
/*
- * writeback is not making progress due to locked
- * buffers. Skip this inode for now.
+ * Writeback is not making progress due to locked buffers.
+ * Skip this inode for now. Although having skipped pages
+ * is odd for clean inodes, it can happen for some
+ * filesystems so handle that gracefully.
*/
- redirty_tail_locked(inode, wb);
+ if (inode->i_state & I_DIRTY_ALL)
+ redirty_tail_locked(inode, wb);
+ else
+ inode_cgwb_move_to_attached(inode, wb);
return;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..4a280be229a6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2010,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
if (!spin_trylock(&gl->gl_lockref.lock))
continue;
- if (!gl->gl_lockref.count) {
+ if (gl->gl_lockref.count <= 1 &&
+ (gl->gl_state == LM_ST_UNLOCKED ||
+ demote_ok(gl))) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
freed++;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d26759a98b10..f41ca89d216b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -567,15 +567,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote)
struct super_block *sb = sdp->sd_vfs;
if (!remote ||
- gl->gl_state != LM_ST_SHARED ||
+ (gl->gl_state != LM_ST_SHARED &&
+ gl->gl_state != LM_ST_UNLOCKED) ||
gl->gl_demote_state != LM_ST_UNLOCKED)
return;
/*
* Try to get an active super block reference to prevent racing with
- * unmount (see trylock_super()). But note that unmount isn't the only
- * place where a write lock on s_umount is taken, and we can fail here
- * because of things like remount as well.
+ * unmount (see super_trylock_shared()). But note that unmount isn't
+ * the only place where a write lock on s_umount is taken, and we can
+ * fail here because of things like remount as well.
*/
if (down_read_trylock(&sb->s_umount)) {
atomic_inc(&sb->s_active);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 21ada332d555..1429945215a0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+ sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..84bc3c76e5cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2102,52 +2102,10 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
-/**
- * current_mgtime - Return FS time (possibly fine-grained)
- * @inode: inode.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
- * as having been QUERIED, get a fine-grained timestamp.
- */
-struct timespec64 current_mgtime(struct inode *inode)
-{
- struct timespec64 now, ctime;
- atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
- long nsec = atomic_long_read(pnsec);
-
- if (nsec & I_CTIME_QUERIED) {
- ktime_get_real_ts64(&now);
- return timestamp_truncate(now, inode);
- }
-
- ktime_get_coarse_real_ts64(&now);
- now = timestamp_truncate(now, inode);
-
- /*
- * If we've recently fetched a fine-grained timestamp
- * then the coarse-grained one may still be earlier than the
- * existing ctime. Just keep the existing value if so.
- */
- ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &now) > 0)
- now = ctime;
-
- return now;
-}
-EXPORT_SYMBOL(current_mgtime);
-
-static struct timespec64 current_ctime(struct inode *inode)
-{
- if (is_mgtime(inode))
- return current_mgtime(inode);
- return current_time(inode);
-}
-
static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
- struct timespec64 now = current_ctime(inode);
+ struct timespec64 now = current_time(inode);
struct timespec64 ctime;
/* First try to exhaust all avenues to not sync */
@@ -2578,43 +2536,9 @@ EXPORT_SYMBOL(current_time);
*/
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
- struct timespec64 now;
- struct timespec64 ctime;
-
- ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec);
- if (!(ctime.tv_nsec & I_CTIME_QUERIED)) {
- now = current_time(inode);
+ struct timespec64 now = current_time(inode);
- /* Just copy it into place if it's not multigrain */
- if (!is_mgtime(inode)) {
- inode_set_ctime_to_ts(inode, now);
- return now;
- }
-
- /*
- * If we've recently updated with a fine-grained timestamp,
- * then the coarse-grained one may still be earlier than the
- * existing ctime. Just keep the existing value if so.
- */
- ctime.tv_sec = inode->__i_ctime.tv_sec;
- if (timespec64_compare(&ctime, &now) > 0)
- return ctime;
-
- /*
- * Ctime updates are usually protected by the inode_lock, but
- * we can still race with someone setting the QUERIED flag.
- * Try to swap the new nsec value into place. If it's changed
- * in the interim, then just go with a fine-grained timestamp.
- */
- if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec,
- now.tv_nsec) != ctime.tv_nsec)
- goto fine_grained;
- inode->__i_ctime.tv_sec = now.tv_sec;
- return now;
- }
-fine_grained:
- ktime_get_real_ts64(&now);
- inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode));
+ inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ae8673ce08b1..5db54ca29a35 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -640,11 +640,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t poff, plen;
/*
- * If the write completely overlaps the current folio, then
+ * If the write or zeroing completely overlaps the current folio, then
* entire folio will be dirtied so there is no need for
* per-block state tracking structures to be attached to this folio.
+ * For the unshare case, we must read in the ondisk contents because we
+ * are not changing pagecache contents.
*/
- if (pos <= folio_pos(folio) &&
+ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
pos + len >= folio_pos(folio) + folio_size(folio))
return 0;
@@ -1047,7 +1049,7 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
/*
* Scan the data range passed to us for dirty page cache folios. If we find a
- * dirty folio, punch out the preceeding range and update the offset from which
+ * dirty folio, punch out the preceding range and update the offset from which
* the next punch will start from.
*
* We can punch out storage reservations under clean pages because they either
@@ -1261,7 +1263,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
- long status = 0;
loff_t written = 0;
/* don't bother with blocks that are not shared to start with */
@@ -1272,28 +1273,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
return length;
do {
- unsigned long offset = offset_in_page(pos);
- unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
struct folio *folio;
+ int status;
+ size_t offset;
+ size_t bytes = min_t(u64, SIZE_MAX, length);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
- if (iter->iomap.flags & IOMAP_F_STALE)
+ if (iomap->flags & IOMAP_F_STALE)
break;
- status = iomap_write_end(iter, pos, bytes, bytes, folio);
- if (WARN_ON_ONCE(status == 0))
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
+ bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ if (WARN_ON_ONCE(bytes == 0))
return -EIO;
cond_resched();
- pos += status;
- written += status;
- length -= status;
+ pos += bytes;
+ written += bytes;
+ length -= bytes;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length);
+ } while (length > 0);
return written;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1073259902a6..8d6f934c3d95 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
- struct page *page = bh->b_page;
char *addr;
__u32 checksum;
- addr = kmap_atomic(page);
- checksum = crc32_be(crc32_sum,
- (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+ checksum = crc32_be(crc32_sum, addr, bh->b_size);
+ kunmap_local(addr);
return checksum;
}
@@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
- struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
@@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
return;
seq = cpu_to_be32(sequence);
- addr = kmap_atomic(page);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
- bh->b_size);
- kunmap_atomic(addr);
+ csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
+ kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 768fa05bcbed..30dec2bd2ecc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1601,6 +1601,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
err_cleanup:
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ if (journal->j_chksum_driver)
+ crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
journal_fail_superblock(journal);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4d1fda1f7143..5f08b5fd105a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh)
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
- struct page *page;
- int offset;
char *source;
struct buffer_head *bh = jh2bh(jh);
J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
- page = bh->b_page;
- offset = offset_in_page(bh->b_data);
- source = kmap_atomic(page);
+ source = kmap_local_folio(bh->b_folio, bh_offset(bh));
/* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
- memcpy(jh->b_frozen_data, source + offset, bh->b_size);
- kunmap_atomic(source);
+ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source, bh->b_size);
+ kunmap_local(source);
/*
* Now that the frozen data is saved off, we need to store any matching
diff --git a/fs/libfs.c b/fs/libfs.c
index a4eb12757886..37f2d34ee090 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1903,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
* We don't know how much we wrote, so just return the number of
* bytes which were direct-written
*/
+ iocb->ki_pos -= buffered_written;
if (direct_written)
return direct_written;
return err;
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3404707ddbe7..2cd3ccf4c439 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
+ bool folio_started;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
+ folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
folio_start_fscache(folio);
+ folio_started = true;
+ }
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 47d892a1d363..f6c74f424691 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
dreq->max_count = dreq_len;
if (dreq->count > dreq_len)
dreq->count = dreq_len;
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
- else /* Clear outstanding error if this is EOF */
- dreq->error = 0;
}
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
+ dreq->error = hdr->error;
}
static void
@@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
+static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
+ struct nfs_page *req)
+{
+ loff_t offs = req_offset(req);
+ size_t req_start = (size_t)(offs - dreq->io_start);
+
+ if (req_start < dreq->max_count)
+ dreq->max_count = req_start;
+ if (req_start < dreq->count)
+ dreq->count = req_start;
+}
+
/**
* nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
@@ -488,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list,
kref_get(&head->wb_kref);
}
-static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_join_group(struct list_head *list,
+ struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
struct nfs_page *req, *subreq;
@@ -510,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
nfs_release_request(subreq);
}
} while ((subreq = subreq->wb_this_page) != req);
- nfs_join_page_group(req, inode);
+ nfs_join_page_group(req, cinfo, inode);
}
}
@@ -528,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
- struct nfs_page *req, *tmp;
+ struct nfs_page *req;
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
- LIST_HEAD(failed);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
- nfs_direct_join_group(&reqs, dreq->inode);
+ nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
- dreq->count = 0;
- dreq->max_count = 0;
- list_for_each_entry(req, &reqs, wb_list)
- dreq->max_count += req->wb_bytes;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -549,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
/* Bump the transmission count */
req->wb_nio++;
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_move_request(req, &failed);
- spin_lock(&cinfo.inode->i_lock);
- dreq->flags = 0;
- if (desc.pg_error < 0)
+ spin_lock(&dreq->lock);
+ if (dreq->error < 0) {
+ desc.pg_error = dreq->error;
+ } else if (desc.pg_error != -EAGAIN) {
+ dreq->flags = 0;
+ if (!desc.pg_error)
+ desc.pg_error = -EIO;
dreq->error = desc.pg_error;
- else
- dreq->error = -EIO;
- spin_unlock(&cinfo.inode->i_lock);
+ } else
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ break;
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
- while (!list_empty(&failed)) {
- req = nfs_list_entry(failed.next);
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
nfs_unlock_and_release_request(req);
+ if (desc.pg_error == -EAGAIN) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ }
}
if (put_dreq(dreq))
@@ -589,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
- dreq->max_count = 0;
- dreq->count = 0;
dreq->flags = NFS_ODIRECT_DONE;
} else {
status = dreq->error;
@@ -601,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ if (status < 0) {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ } else if (!nfs_write_match_verf(verf, req)) {
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
@@ -609,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
*/
req->wb_nio = 0;
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else /* Error or match */
+ } else
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -662,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
while (!list_empty(&reqs)) {
req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
+ nfs_direct_truncate_request(dreq, req);
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -711,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
+ !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
flags = dreq->flags;
@@ -755,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error)
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_page *req;
+ struct nfs_commit_info cinfo;
trace_nfs_direct_write_reschedule_io(dreq);
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
- if (dreq->error == 0) {
+ if (dreq->error == 0)
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.offset + hdr->args.count -
- hdr->io_start;
- }
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
spin_unlock(&dreq->lock);
+ while (!list_empty(&hdr->pages)) {
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
}
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
@@ -794,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
+ struct nfs_commit_info cinfo;
ssize_t result = 0;
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ bool defer = false;
trace_nfs_direct_write_schedule_iovec(dreq);
@@ -837,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_lock_request(req);
- if (!nfs_pageio_add_request(&desc, req)) {
- result = desc.pg_error;
- nfs_unlock_and_release_request(req);
- break;
- }
pgbase = 0;
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
dreq->bytes_left -= req_len;
+
+ if (defer) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ continue;
+ }
+
+ nfs_lock_request(req);
+ if (nfs_pageio_add_request(&desc, req))
+ continue;
+
+ /* Exit on hard errors */
+ if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+
+ /* If the error is soft, defer remaining requests */
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ desc.pg_error = 0;
+ defer = true;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7deb3cd76abe..a1dc33864906 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
case -EOPNOTSUPP:
+ case -EINVAL:
case -ECONNREFUSED:
case -ECONNRESET:
case -EHOSTDOWN:
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 27fb25567ce7..11e3a285594c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
.net = old->cl_net,
.servername = old->cl_hostname,
};
+ int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ?
+ clp->cl_max_connect : old->cl_max_connect;
if (clp->cl_proto != old->cl_proto)
return;
@@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
xprt_args.addrlen = clp_salen;
rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
- rpc_clnt_test_and_add_xprt, NULL);
+ rpc_clnt_test_and_add_xprt, &max_connect);
}
/**
@@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
+ __set_bit(NFS_CS_PNFS, &cl_init.init_flags);
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
/*
	 * Set an authflavor equal to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 794343790ea8..7016eaadf555 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_fh *fh = &o_res->fh;
+
nfs4_sequence_free_slot(&o_res->seq_res);
- nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL);
+ if (o_arg->claim == NFS4_OPEN_CLAIM_FH)
+ fh = NFS_FH(d_inode(data->dentry));
+ nfs4_proc_getattr(server, fh, o_res->f_attr, NULL);
}
return 0;
}
@@ -10618,7 +10622,9 @@ static void nfs4_disable_swap(struct inode *inode)
*/
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
- nfs4_schedule_state_manager(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ wake_up_var(&clp->cl_state);
}
static const struct inode_operations nfs4_dir_inode_operations = {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e079987af4a3..9a5d911a7edc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ bool swapon = false;
- if (clp->cl_rpcclient->cl_shutdown)
+ if (clnt->cl_shutdown)
return;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
- if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
- wake_up_var(&clp->cl_state);
- return;
+
+ if (atomic_read(&clnt->cl_swapper)) {
+ swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+ &clp->cl_state);
+ if (!swapon) {
+ wake_up_var(&clp->cl_state);
+ return;
+ }
}
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
+
__module_get(THIS_MODULE);
refcount_inc(&clp->cl_count);
@@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
__func__, PTR_ERR(task));
if (!nfs_client_init_is_complete(clp))
nfs_mark_client_ready(clp, PTR_ERR(task));
+ if (swapon)
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs4_clear_state_manager_bit(clp);
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
@@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr)
allow_signal(SIGKILL);
again:
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
nfs4_state_manager(clp);
- if (atomic_read(&cl->cl_swapper)) {
+
+ if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
wait_var_event_interruptible(&clp->cl_state,
test_bit(NFS4CLNT_RUN_MANAGER,
&clp->cl_state));
- if (atomic_read(&cl->cl_swapper) &&
- test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ if (!atomic_read(&cl->cl_swapper))
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
		/* Either no longer a swapper, or we were signalled */
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
}
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
- !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
nfs_put_client(clp);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f4cca8f00c0c..7720b5e43014 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_inode_remove_request(struct nfs_page *req);
-static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
static struct nfs_page *
@@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
*/
-void
-nfs_join_page_group(struct nfs_page *head, struct inode *inode)
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
@@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
* Commit list removal accounting is done after locks are dropped */
subreq = head;
do {
- nfs_clear_request_commit(subreq);
+ nfs_clear_request_commit(cinfo, subreq);
subreq = subreq->wb_this_page;
} while (subreq != head);
@@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
{
struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_page *head;
+ struct nfs_commit_info cinfo;
int ret;
+ nfs_init_cinfo_from_inode(&cinfo, inode);
/*
* A reference is taken only on the head request which acts as a
* reference to the whole page group - the group will not be destroyed
@@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
return ERR_PTR(ret);
}
- nfs_join_page_group(head, inode);
+ nfs_join_page_group(head, &cinfo, inode);
return head;
}
@@ -799,8 +802,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
- nfs_release_request(req);
atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests);
+ nfs_release_request(req);
}
}
@@ -955,18 +958,16 @@ static void nfs_folio_clear_commit(struct folio *folio)
}
/* Called holding the request lock on @req */
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
if (test_bit(PG_CLEAN, &req->wb_flags)) {
struct nfs_open_context *ctx = nfs_req_openctx(req);
struct inode *inode = d_inode(ctx->dentry);
- struct nfs_commit_info cinfo;
- nfs_init_cinfo_from_inode(&cinfo, inode);
mutex_lock(&NFS_I(inode)->commit_mutex);
- if (!pnfs_clear_request_commit(req, &cinfo)) {
- nfs_request_remove_commit_list(req, &cinfo);
+ if (!pnfs_clear_request_commit(req, cinfo)) {
+ nfs_request_remove_commit_list(req, cinfo);
}
mutex_unlock(&NFS_I(inode)->commit_mutex);
nfs_folio_clear_commit(nfs_page_to_folio(req));
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5ca748309c26..4199ede0583c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1058,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
rename->rn_tname, rename->rn_tnamelen);
if (status)
return status;
- set_change_info(&rename->rn_sinfo, &cstate->current_fh);
- set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_sinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->current_fh);
return nfs_ok;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2e40c74d2f72..92c7dde148a4 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4113,6 +4113,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct file *file, unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
+ unsigned int base = xdr->buf->page_len & ~PAGE_MASK;
unsigned int starting_len = xdr->buf->len;
__be32 zero = xdr_zero;
__be32 nfserr;
@@ -4121,8 +4122,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
return nfserr_resource;
nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
- read->rd_offset, &maxcount,
- xdr->buf->page_len & ~PAGE_MASK,
+ read->rd_offset, &maxcount, base,
&read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1582af33e204..c7af1095f6b5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1082,11 +1082,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
+ struct seq_file *seq = file->private_data;
+ struct svc_serv *serv = seq->private;
int ret = seq_release(inode, file);
- struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- nfsd_put(net);
+ svc_put(serv);
mutex_unlock(&nfsd_mutex);
return ret;
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 48fe71d309cb..8beb2730929d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
- if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
- brelse(bh);
+ if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */
goto failed;
- }
}
lock_buffer(bh);
@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
put_page(bh->b_page);
+ if (unlikely(err))
+ brelse(bh);
return err;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index cfec5e0c7f66..5661a363005e 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1562,6 +1562,7 @@ load_root:
put_inode_out:
iput(inode);
out:
+ ntfs3_put_sbi(sbi);
kfree(boot2);
return err;
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index bae404a1bad4..ada3fcc9c6d5 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
{
struct iattr attr = {
.ia_valid =
- ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
@@ -618,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
- if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) {
+ if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
/*
* Copy the fileattr inode flags that are the source of already
* copied i_flags
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index c8c8588bd98c..26b782c53910 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -188,7 +188,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
/* Lower file handle for non-upper non-decodable */
if (!ovl_dentry_upper(dentry) && !decodable)
- return 0;
+ return 1;
/* Upper file handle for pure upper */
if (!ovl_dentry_lower(dentry))
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 3b4cc633d763..8be4dc050d1e 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -19,7 +19,6 @@ struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
- struct fd fd;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -280,7 +279,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
{
if (refcount_dec_and_test(&aio_req->ref)) {
- fdput(aio_req->fd);
+ fput(aio_req->iocb.ki_filp);
kmem_cache_free(ovl_aio_request_cachep, aio_req);
}
}
@@ -342,10 +341,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
@@ -393,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+ /*
+ * Overlayfs doesn't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ ifl &= ~IOCB_DIO_CALLER_COMP;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);
@@ -409,10 +412,8 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index e9539f98e86a..d82d2a043da2 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -8,6 +8,7 @@
struct ovl_config {
char *upperdir;
char *workdir;
+ char **lowerdirs;
bool default_permissions;
int redirect_mode;
int verity_mode;
@@ -39,17 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
- char *name;
};
-/*
- * ovl_free_fs() relies on @mnt being the first member when unmounting
- * the private mounts created for each layer. Let's check both the
- * offset and type.
- */
-static_assert(offsetof(struct ovl_layer, mnt) == 0);
-static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
-
struct ovl_path {
const struct ovl_layer *layer;
struct dentry *dentry;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index b9355bb6d75a..95b751507ac8 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -752,12 +752,12 @@ void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
- /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
- mounts = (struct vfsmount **) ofs->layers;
+ /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */
+ mounts = (struct vfsmount **) ofs->config.lowerdirs;
for (i = 0; i < ofs->numlayer; i++) {
iput(ofs->layers[i].trap);
+ kfree(ofs->config.lowerdirs[i]);
mounts[i] = ofs->layers[i].mnt;
- kfree(ofs->layers[i].name);
}
kern_unmount_array(mounts, ofs->numlayer);
kfree(ofs->layers);
@@ -765,6 +765,7 @@ void ovl_free_fs(struct ovl_fs *ofs)
free_anon_bdev(ofs->fs[i].pseudo_dev);
kfree(ofs->fs);
+ kfree(ofs->config.lowerdirs);
kfree(ofs->config.upperdir);
kfree(ofs->config.workdir);
if (ofs->creator_cred)
@@ -949,16 +950,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = OVL_FS(sb);
size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
- const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower];
+ char **lowerdatadirs = &ofs->config.lowerdirs[nr_merged_lower];
- /* ofs->layers[0] is the upper layer */
- seq_printf(m, ",lowerdir=%s", ofs->layers[1].name);
+ /* lowerdirs[] starts from offset 1 */
+ seq_printf(m, ",lowerdir=%s", ofs->config.lowerdirs[1]);
/* dump regular lower layers */
for (nr = 2; nr < nr_merged_lower; nr++)
- seq_printf(m, ":%s", ofs->layers[nr].name);
+ seq_printf(m, ":%s", ofs->config.lowerdirs[nr]);
/* dump data lower layers */
for (nr = 0; nr < ofs->numdatalayer; nr++)
- seq_printf(m, "::%s", data_layers[nr].name);
+ seq_printf(m, "::%s", lowerdatadirs[nr]);
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
seq_show_option(m, "workdir", ofs->config.workdir);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index def266b5e2a3..3fa2416264a4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
- struct ovl_entry *oe = OVL_E(dentry);
- struct ovl_path *lowerstack = ovl_lowerstack(oe);
+ struct ovl_entry *oe;
+ struct ovl_path *lowerstack;
struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
@@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
if (!inode)
return -ECHILD;
+ oe = OVL_I_E(inode);
+ lowerstack = ovl_lowerstack(oe);
upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -167,6 +169,7 @@ static void ovl_free_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
kfree(oi->redirect);
+ kfree(oi->oe);
mutex_destroy(&oi->lock);
kmem_cache_free(ovl_inode_cachep, oi);
}
@@ -176,7 +179,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- ovl_free_entry(oi->oe);
+ ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
@@ -569,11 +572,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
upper_layer->idx = 0;
upper_layer->fsid = 0;
- err = -ENOMEM;
- upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL);
- if (!upper_layer->name)
- goto out;
-
/*
* Inherit SB_NOSEC flag from upperdir.
*
@@ -1122,7 +1120,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
layers[ofs->numlayer].idx = ofs->numlayer;
layers[ofs->numlayer].fsid = fsid;
layers[ofs->numlayer].fs = &ofs->fs[fsid];
- layers[ofs->numlayer].name = l->name;
+ /* Store for printing lowerdir=... in ovl_show_options() */
+ ofs->config.lowerdirs[ofs->numlayer] = l->name;
l->name = NULL;
ofs->numlayer++;
ofs->fs[fsid].is_lower = true;
@@ -1367,8 +1366,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (!layers)
goto out_err;
+ ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL);
+ if (!ofs->config.lowerdirs) {
+ kfree(layers);
+ goto out_err;
+ }
ofs->layers = layers;
- /* Layer 0 is reserved for upper even if there's no upper */
+ /*
+ * Layer 0 is reserved for upper even if there's no upper.
+ * For consistency, config.lowerdirs[0] is NULL.
+ */
ofs->numlayer = 1;
sb->s_stack_depth = 0;
diff --git a/fs/pipe.c b/fs/pipe.c
index 6c1a9b1db907..139190165a1c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
break;
}
ret += copied;
- buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from))
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9dda7e54b2d0..9a8f32f21ff5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -289,9 +289,7 @@ struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
-#ifdef CONFIG_MMU
struct vma_iterator iter;
-#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a8ac0dd8041e..7cebd397cc26 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -175,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p)
return nommu_vma_show(m, _p);
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
- unsigned long addr = *pos;
- /* See m_next(). Zero at the start or after lseek. */
- if (addr == -1UL)
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
return NULL;
/* pin the task and mm whilst we play with them */
@@ -192,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
- /* start the next element from addr */
- vma = find_vma(mm, addr);
- if (vma)
- return vma;
+ vma_iter_init(&priv->iter, mm, last_addr);
- mmap_read_unlock(mm);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct vm_area_struct *vma = _p;
-
- *pos = vma->vm_end;
- return find_vma(vma->vm_mm, vma->vm_end);
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index b81749492ef9..7d12b8c5b2fa 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2699,7 +2699,7 @@ struct reiserfs_iget_args {
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
#define journal_trans_half(blocksize) \
- ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+ ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
/* journal.c see journal.c for all the comments here */
@@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc {
__le32 j_len;
__le32 j_mount_id; /* mount id of this trans */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
@@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc {
struct reiserfs_journal_commit {
__le32 j_trans_id; /* must match j_trans_id from the desc block */
__le32 j_len; /* ditto */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index b17f067e4ada..e2be8aedb26e 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -452,6 +452,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
+ if (cfids == NULL)
+ return;
+
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
list_move(&cfid->entry, &entry);
@@ -651,6 +654,9 @@ void free_cached_dirs(struct cached_fids *cfids)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
+ if (cfids == NULL)
+ return;
+
if (cfids->laundromat) {
kthread_stop(cfids->laundromat);
cfids->laundromat = NULL;
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 032d8716f671..02082621d8e0 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1807,6 +1807,7 @@ static inline bool is_retryable_error(int error)
#define MID_RETRY_NEEDED 8 /* session closed while this request out */
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
+#define MID_RESPONSE_READY 0x40 /* ready for another process to handle the rsp */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
@@ -1943,7 +1944,7 @@ require use of the stronger protocol */
* cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
* ->can_cache_brlcks
* cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
- * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc
+ * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc
* cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 7d8035846680..0c37eefa18a5 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -512,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
extern struct cifs_ses *sesInfoAlloc(void);
extern void sesInfoFree(struct cifs_ses *);
-extern struct cifs_tcon *tconInfoAlloc(void);
+extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled);
extern void tconInfoFree(struct cifs_tcon *);
extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 687754791bf0..3902e90dca6b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -1882,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
}
- tcon = tconInfoAlloc();
+ /* no need to setup directory caching on IPC share, so pass in false */
+ tcon = tcon_info_alloc(false);
if (tcon == NULL)
return -ENOMEM;
@@ -2492,7 +2493,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out_fail;
}
- tcon = tconInfoAlloc();
+ if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
+ tcon = tcon_info_alloc(true);
+ else
+ tcon = tcon_info_alloc(false);
if (tcon == NULL) {
rc = -ENOMEM;
goto out_fail;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index e45ce31bbda7..a3493da12ad1 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1541,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_parse_mount_err:
kfree_sensitive(ctx->password);
+ ctx->password = NULL;
return -EINVAL;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index de2dfbaae821..d7c302442c1e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2680,7 +2680,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
}
cifsFileInfo_put(cfile);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int cifs_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 366b755ca913..35b176457bbe 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free)
}
struct cifs_tcon *
-tconInfoAlloc(void)
+tcon_info_alloc(bool dir_leases_enabled)
{
struct cifs_tcon *ret_buf;
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->cfids = init_cached_dirs();
- if (!ret_buf->cfids) {
- kfree(ret_buf);
- return NULL;
+
+ if (dir_leases_enabled == true) {
+ ret_buf->cfids = init_cached_dirs();
+ if (!ret_buf->cfids) {
+ kfree(ret_buf);
+ return NULL;
+ }
}
+ /* else ret_buf->cfids is already set to NULL above */
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index b41e2e872b22..0b89f7008ac0 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -539,6 +539,9 @@ static int parse_create_response(struct cifs_open_info_data *data,
int rc = 0;
switch (rsp->hdr.Status) {
+ case STATUS_IO_REPARSE_TAG_NOT_HANDLED:
+ reparse_point = true;
+ break;
case STATUS_STOPPED_ON_SYMLINK:
rc = smb2_parse_symlink_response(cifs_sb, iov,
&data->symlink_target);
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 194799ddd382..1a90dd78b238 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
"STATUS_IO_REPARSE_TAG_MISMATCH"},
{STATUS_IO_REPARSE_DATA_INVALID, -EIO,
"STATUS_IO_REPARSE_DATA_INVALID"},
- {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO,
- "STATUS_IO_REPARSE_TAG_NOT_HANDLED"},
{STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO,
"STATUS_REPARSE_POINT_NOT_RESOLVED"},
{STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO,
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index d9eda2e958b4..9aeecee6b91b 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -297,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
credits->value, new_val);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
spin_lock(&server->req_lock);
@@ -1161,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
/* Use a fudge factor of 256 bytes in case we collide
* with a different set_EAs command.
*/
- if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
used_len + ea_name_len + ea_value_len + 1) {
rc = -ENOSPC;
@@ -4591,7 +4591,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (shdr->Command != SMB2_READ) {
cifs_server_dbg(VFS, "only big read responses are supported\n");
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
if (server->ops->is_session_expired &&
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 092b0087c9dc..c75a80bb6d9e 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -89,20 +89,26 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server)
{
struct smb3_hdr_req *smb3_hdr;
+
shdr->ProtocolId = SMB2_PROTO_NUMBER;
shdr->StructureSize = cpu_to_le16(64);
shdr->Command = smb2_cmd;
- if (server->dialect >= SMB30_PROT_ID) {
- /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */
- smb3_hdr = (struct smb3_hdr_req *)shdr;
- /* if primary channel is not set yet, use default channel for chan sequence num */
- if (SERVER_IS_CHAN(server))
- smb3_hdr->ChannelSequence =
- cpu_to_le16(server->primary_server->channel_sequence_num);
- else
- smb3_hdr->ChannelSequence = cpu_to_le16(server->channel_sequence_num);
- }
+
if (server) {
+ /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */
+ if (server->dialect >= SMB30_PROT_ID) {
+ smb3_hdr = (struct smb3_hdr_req *)shdr;
+ /*
+ * if primary channel is not set yet, use default
+ * channel for chan sequence num
+ */
+ if (SERVER_IS_CHAN(server))
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->primary_server->channel_sequence_num);
+ else
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->channel_sequence_num);
+ }
spin_lock(&server->req_lock);
/* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
@@ -842,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
iov[num].iov_base = create_posix_buf(mode);
if (mode == ACL_NO_MODE)
- cifs_dbg(FYI, "Invalid mode\n");
+ cifs_dbg(FYI, "%s: no mode\n", __func__);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2234,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
* (most servers default to 120 seconds) and most clients default to 0.
* This can be overridden at mount ("handletimeout=") if the user wants
* a different persistent (or resilient) handle timeout for all opens
- * opens on a particular SMB3 mount.
+ * on a particular SMB3 mount.
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
@@ -2379,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
return 0;
}
-/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
+/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
static void setup_owner_group_sids(char *buf)
{
struct owner_group_sids *sids = (struct owner_group_sids *)buf;
@@ -3124,6 +3130,7 @@ void
SMB2_ioctl_free(struct smb_rqst *rqst)
{
int i;
+
if (rqst && rqst->rq_iov) {
cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
for (i = 1; i < rqst->rq_nvec; i++)
@@ -3871,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work)
goto done;
/* allocate a dummy tcon struct used for reconnect */
- tcon = tconInfoAlloc();
+ tcon = tcon_info_alloc(false);
if (!tcon) {
resched = true;
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 2a2aec8c6112..94df9eec3d8d 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -1401,10 +1401,13 @@ create_conn:
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- if (server->smbd_conn)
+ if (server->smbd_conn) {
cifs_dbg(VFS, "RDMA transport re-established\n");
-
- return server->smbd_conn ? 0 : -ENOENT;
+ trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
+ return 0;
+ }
+ trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
+ return -ENOENT;
}
static void destroy_caches_and_workqueue(struct smbd_connection *info)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index a7e4755bed0f..de199ec9f726 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \
TP_ARGS(hostname, conn_id, addr))
DEFINE_SMB3_CONNECT_EVENT(connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err);
DECLARE_EVENT_CLASS(smb3_connect_err_class,
TP_PROTO(char *hostname, __u64 conn_id,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 1b5d9794ed5b..14710afdc2a3 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -18,7 +18,7 @@
#include <linux/bvec.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>
-#include <asm/processor.h>
+#include <linux/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
@@ -35,6 +35,8 @@
void
cifs_wake_up_task(struct mid_q_entry *mid)
{
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
wake_up_process(mid->callback_data);
}
@@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount)
struct TCP_Server_Info *server = midEntry->server;
if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
- midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ (midEntry->mid_state == MID_RESPONSE_RECEIVED ||
+ midEntry->mid_state == MID_RESPONSE_READY) &&
server->ops->handle_cancelled_mid)
server->ops->handle_cancelled_mid(midEntry, server);
@@ -737,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
int error;
error = wait_event_state(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED,
+ midQ->mid_state != MID_REQUEST_SUBMITTED &&
+ midQ->mid_state != MID_RESPONSE_RECEIVED,
(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
@@ -890,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
spin_lock(&server->mid_lock);
switch (mid->mid_state) {
- case MID_RESPONSE_RECEIVED:
+ case MID_RESPONSE_READY:
spin_unlock(&server->mid_lock);
return rc;
case MID_RETRY_NEEDED:
@@ -989,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid)
credits.instance = server->reconnect_instance;
add_credits(server, &credits, mid->optype);
+
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
}
static void
@@ -1209,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&server->mid_lock);
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
@@ -1230,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ[i]->resp_buf ||
- midQ[i]->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ[i]->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_dbg(FYI, "Bad MID state?\n");
goto out;
@@ -1417,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc != 0) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1434,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ->resp_buf || !out_buf ||
- midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_server_dbg(VFS, "Bad MID state?\n");
goto out;
@@ -1558,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* Wait for a reply - allow signals to interrupt. */
rc = wait_event_interruptible(server->response_q,
- (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED)) ||
((server->tcpStatus != CifsGood) &&
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
spin_lock(&server->srv_lock);
if ((rc == -ERESTARTSYS) &&
- (midQ->mid_state == MID_REQUEST_SUBMITTED) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
spin_unlock(&server->srv_lock);
@@ -1596,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1616,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
/* rcvd frame is ok */
- if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_tcon_dbg(VFS, "Bad MID state?\n");
goto out;
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 0d990c2f33cd..db7fa704a3f6 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -197,6 +197,9 @@ int ksmbd_conn_write(struct ksmbd_work *work)
if (work->send_no_response)
return 0;
+ if (!work->iov_idx)
+ return -EINVAL;
+
ksmbd_conn_lock(conn);
sent = conn->transport->ops->writev(conn->transport, work->iov,
work->iov_cnt,
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 5ab2f52f9b35..32347fec33c4 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn,
if (check_conn_state(work))
return SERVER_HANDLER_CONTINUE;
- if (ksmbd_verify_smb_message(work))
+ if (ksmbd_verify_smb_message(work)) {
+ conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
return SERVER_HANDLER_ABORT;
+ }
command = conn->ops->get_cmd_val(work);
*cmd = command;
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index e881df1d10cb..23bd3d1209df 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
validate_credit:
if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(work->conn, hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ smb2_validate_credit_charge(work->conn, hdr))
return 1;
- }
return 0;
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 749660110878..544022dd6d20 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -6312,7 +6312,7 @@ int smb2_read(struct ksmbd_work *work)
aux_payload_buf,
nbytes);
kvfree(aux_payload_buf);
-
+ aux_payload_buf = NULL;
nbytes = 0;
if (remain_bytes < 0) {
err = (int)remain_bytes;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index e5e438bf5499..6c0305be895e 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
out:
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
- mark_inode_dirty(inode);
return rc;
}
diff --git a/fs/stat.c b/fs/stat.c
index 6822ac77aec2..d43a5cc1bfa4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,37 +27,6 @@
#include "mount.h"
/**
- * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
- * @stat: where to store the resulting values
- * @request_mask: STATX_* values requested
- * @inode: inode from which to grab the c/mtime
- *
- * Given @inode, grab the ctime and mtime out if it and store the result
- * in @stat. When fetching the value, flag it as queried so the next write
- * will use a fine-grained timestamp.
- */
-void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
-{
- atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
-
- /* If neither time was requested, then don't report them */
- if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
- stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
- return;
- }
-
- stat->mtime = inode->i_mtime;
- stat->ctime.tv_sec = inode->__i_ctime.tv_sec;
- /*
- * Atomically set the QUERIED flag and fetch the new value with
- * the flag masked off.
- */
- stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) &
- ~I_CTIME_QUERIED;
-}
-EXPORT_SYMBOL(fill_mg_cmtime);
-
-/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
* @request_mask: statx request_mask
@@ -89,14 +58,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
-
- if (is_mgtime(inode)) {
- fill_mg_cmtime(stat, request_mask, inode);
- } else {
- stat->mtime = inode->i_mtime;
- stat->ctime = inode_get_ctime(inode);
- }
-
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode_get_ctime(inode);
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -419,12 +382,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
#ifdef __ARCH_WANT_NEW_STAT
-#if BITS_PER_LONG == 32
-# define choose_32_64(a,b) a
-#else
-# define choose_32_64(a,b) b
-#endif
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 237c6f370ad9..8c8d64e76103 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -70,6 +70,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags);
static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
static int eventfs_release(struct inode *inode, struct file *file);
static const struct inode_operations eventfs_root_dir_inode_operations = {
@@ -79,7 +80,7 @@ static const struct inode_operations eventfs_root_dir_inode_operations = {
static const struct file_operations eventfs_file_operations = {
.open = dcache_dir_open_wrapper,
.read = generic_read_dir,
- .iterate_shared = dcache_readdir,
+ .iterate_shared = dcache_readdir_wrapper,
.llseek = generic_file_llseek,
.release = eventfs_release,
};
@@ -185,17 +186,49 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
/**
* eventfs_set_ef_status_free - set the ef->status to free
+ * @ti: the tracefs_inode of the dentry
* @dentry: dentry who's status to be freed
*
* eventfs_set_ef_status_free will be called if no more
* references remain
*/
-void eventfs_set_ef_status_free(struct dentry *dentry)
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
{
struct tracefs_inode *ti_parent;
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef, *tmp;
+
+ /* The top level events directory may be freed by this */
+ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
+ LIST_HEAD(ef_del_list);
+
+ mutex_lock(&eventfs_mutex);
+
+ ei = ti->private;
+
+ /* Record all the top level files */
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ list_add_tail(&ef->del_list, &ef_del_list);
+ }
+
+ /* Nothing should access this, but just in case! */
+ ti->private = NULL;
+
+ mutex_unlock(&eventfs_mutex);
+
+ /* Now safely free the top level files and their children */
+ list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+ list_del(&ef->del_list);
+ eventfs_remove(ef);
+ }
+
+ kfree(ei);
+ return;
+ }
mutex_lock(&eventfs_mutex);
+
ti_parent = get_tracefs(dentry->d_parent->d_inode);
if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
goto out;
@@ -364,6 +397,11 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
return ret;
}
+struct dentry_list {
+ void *cursor;
+ struct dentry **dentries;
+};
+
/**
* eventfs_release - called to release eventfs file/dir
* @inode: inode to be released
@@ -372,26 +410,25 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
static int eventfs_release(struct inode *inode, struct file *file)
{
struct tracefs_inode *ti;
- struct eventfs_inode *ei;
- struct eventfs_file *ef;
- struct dentry *dentry;
- int idx;
+ struct dentry_list *dlist = file->private_data;
+ void *cursor;
+ int i;
ti = get_tracefs(inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
- ei = ti->private;
- idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- mutex_lock(&eventfs_mutex);
- dentry = ef->dentry;
- mutex_unlock(&eventfs_mutex);
- if (dentry)
- dput(dentry);
+ if (WARN_ON_ONCE(!dlist))
+ return -EINVAL;
+
+ for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
+ dput(dlist->dentries[i]);
}
- srcu_read_unlock(&eventfs_srcu, idx);
+
+ cursor = dlist->cursor;
+ kfree(dlist->dentries);
+ kfree(dlist);
+ file->private_data = cursor;
return dcache_dir_close(inode, file);
}
@@ -410,21 +447,70 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
struct tracefs_inode *ti;
struct eventfs_inode *ei;
struct eventfs_file *ef;
+ struct dentry_list *dlist;
+ struct dentry **dentries = NULL;
struct dentry *dentry = file_dentry(file);
+ struct dentry *d;
struct inode *f_inode = file_inode(file);
+ int cnt = 0;
int idx;
+ int ret;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
+ if (WARN_ON_ONCE(file->private_data))
+ return -EINVAL;
+
+ dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
+ if (!dlist)
+ return -ENOMEM;
+
ei = ti->private;
idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_rcu(ef, &ei->e_top_files, list) {
- create_dentry(ef, dentry, false);
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ d = create_dentry(ef, dentry, false);
+ if (d) {
+ struct dentry **tmp;
+
+ tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+ if (!tmp)
+ break;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ cnt++;
+ dentries = tmp;
+ }
}
srcu_read_unlock(&eventfs_srcu, idx);
- return dcache_dir_open(inode, file);
+ ret = dcache_dir_open(inode, file);
+
+ /*
+ * dcache_dir_open() sets file->private_data to a dentry cursor.
+ * Need to save that but also save all the dentries that were
+ * opened by this function.
+ */
+ dlist->cursor = file->private_data;
+ dlist->dentries = dentries;
+ file->private_data = dlist;
+ return ret;
+}
+
+/*
+ * Temporarily hand dcache_readdir() its cursor in file->private_data,
+ * then restore the dentry_list wrapper when it returns.
+ */
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
+{
+ struct dentry_list *dlist = file->private_data;
+ int ret;
+
+ file->private_data = dlist->cursor;
+ ret = dcache_readdir(file, ctx);
+ dlist->cursor = file->private_data;
+ file->private_data = dlist;
+ return ret;
}
/**
@@ -491,6 +577,9 @@ struct dentry *eventfs_create_events_dir(const char *name,
struct tracefs_inode *ti;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (IS_ERR(dentry))
return dentry;
@@ -507,7 +596,7 @@ struct dentry *eventfs_create_events_dir(const char *name,
INIT_LIST_HEAD(&ei->e_top_files);
ti = get_tracefs(inode);
- ti->flags |= TRACEFS_EVENT_INODE;
+ ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
@@ -538,6 +627,9 @@ struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
struct eventfs_inode *ei_parent;
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!parent)
return ERR_PTR(-EINVAL);
@@ -569,6 +661,9 @@ struct eventfs_file *eventfs_add_dir(const char *name,
{
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!ef_parent)
return ERR_PTR(-EINVAL);
@@ -606,6 +701,9 @@ int eventfs_add_events_file(const char *name, umode_t mode,
struct eventfs_inode *ei;
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
if (!parent)
return -EINVAL;
@@ -654,6 +752,9 @@ int eventfs_add_file(const char *name, umode_t mode,
{
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
if (!ef_parent)
return -EINVAL;
@@ -791,7 +892,6 @@ void eventfs_remove(struct eventfs_file *ef)
void eventfs_remove_events_dir(struct dentry *dentry)
{
struct tracefs_inode *ti;
- struct eventfs_inode *ei;
if (!dentry || !dentry->d_inode)
return;
@@ -800,8 +900,6 @@ void eventfs_remove_events_dir(struct dentry *dentry)
if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
return;
- ei = ti->private;
d_invalidate(dentry);
dput(dentry);
- kfree(ei);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index de5b72216b1a..891653ba9cf3 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
ti = get_tracefs(inode);
if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ef_status_free(dentry);
+ eventfs_set_ef_status_free(ti, dentry);
iput(inode);
}
@@ -673,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
*/
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
{
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
return __create_dir(name, parent, &simple_dir_inode_operations);
}
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 69c2b1d87c46..4f2e49e2197b 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -3,7 +3,8 @@
#define _TRACEFS_INTERNAL_H
enum {
- TRACEFS_EVENT_INODE = BIT(1),
+ TRACEFS_EVENT_INODE = BIT(1),
+ TRACEFS_EVENT_TOP_INODE = BIT(2),
};
struct tracefs_inode {
@@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct dentry *dentry);
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index c9d653168ad0..ed0bc8cbc703 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select FS_DEBUG
+ select XFS_DEBUG
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 2420865f3007..a5100a11faf9 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log);
#define xlog_check_buf_cancel_table(log) do { } while (0)
#endif
+/*
+ * Transform a regular reservation into one suitable for recovery of a log
+ * intent item.
+ *
+ * Intent recovery only runs a single step of the transaction chain and defers
+ * the rest to a separate transaction. Therefore, we reduce logcount to 1 here
+ * to avoid livelocks if the log grant space is nearly exhausted due to the
+ * recovered intent pinning the tail. Keep the same logflags to avoid tripping
+ * asserts elsewhere. Struct copies abound below.
+ */
+static inline struct xfs_trans_res
+xlog_recover_resv(const struct xfs_trans_res *r)
+{
+ struct xfs_trans_res ret = {
+ .tr_logres = r->tr_logres,
+ .tr_logcount = 1,
+ .tr_logflags = r->tr_logflags,
+ };
+
+ return ret;
+}
+
#endif /* __XFS_LOG_RECOVER_H__ */
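
For reference, the intent-recovery call sites later in this diff consume the new helper by taking a struct copy on the stack; a condensed sketch, taken from the xfs_extfree_item.c hunk below:

	struct xfs_trans_res resv;

	resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
	error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
	if (error)
		return error;
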
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5e174685a77c..6264daaab37b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -266,7 +266,8 @@ xfs_validate_sb_write(
return -EFSCORRUPTED;
}
- if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ if (!xfs_is_readonly(mp) &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_alert(mp,
"Corruption detected in superblock read-only compatible features (0x%x)!",
(sbp->sb_features_ro_compat &
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index ad22656376d3..6b2296ff248a 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- /* If the mtime changes, then ctime must also change */
- ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = current_time(inode);
- tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
+ if (flags & XFS_ICHGTIME_CHG)
+ inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7d3aa14d81b5..4849efcaa33a 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -588,6 +588,8 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
+ if (error != -ENOENT)
+ xchk_stats_merge(mp, sm, &run);
kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
@@ -595,8 +597,6 @@ out:
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
error = 0;
}
- if (error != -ENOENT)
- xchk_stats_merge(mp, sm, &run);
return error;
need_drain:
error = xchk_teardown(sc, 0);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index aeb92624176b..cd91db4a5548 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -185,7 +185,10 @@ xchk_stats_merge_one(
{
struct xchk_scrub_stats *css;
- ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR) {
+ ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+ return;
+ }
css = &cs->cs_stats[sm->sm_type];
spin_lock(&css->css_lock);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 5db87b34fb6e..89c7a9f4f930 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -333,7 +333,6 @@ xfs_attr_inactive(
int error = 0;
mp = dp->i_mount;
- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
if (!xfs_inode_has_attr_fork(dp))
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 2788a6f2edcd..36fe2abb16e6 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -547,7 +547,7 @@ xfs_attri_item_recover(
struct xfs_inode *ip;
struct xfs_da_args *args;
struct xfs_trans *tp;
- struct xfs_trans_res tres;
+ struct xfs_trans_res resv;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error;
@@ -618,8 +618,9 @@ xfs_attri_item_recover(
goto out;
}
- xfs_init_attr_trans(args, &tres, &total);
- error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ xfs_init_attr_trans(args, &resv, &total);
+ resv = xlog_recover_resv(&resv);
+ error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
goto out;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 7551c3ec4ea5..e736a0844c89 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -490,6 +490,7 @@ xfs_bui_item_recover(
struct list_head *capture_list)
{
struct xfs_bmap_intent fake = { };
+ struct xfs_trans_res resv;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
@@ -515,7 +516,8 @@ xfs_bui_item_recover(
return error;
/* Allocate transaction and do the work. */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
goto err_rele;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1064c2342876..7cd09c3a82cb 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -146,6 +146,20 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return ERR_PTR(error);
+ }
+ }
+
if (VFS_I(ip)->i_generation != generation) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f1a5ecf099aa..3fa8789820ad 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -660,6 +660,7 @@ xfs_efi_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
@@ -683,7 +684,8 @@ xfs_efi_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10403ba9b58f..736e5545f584 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -565,6 +565,19 @@ err:
}
#endif /* CONFIG_XFS_RT */
+static inline bool
+rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
+{
+ if (!xfs_has_reflink(mp))
+ return true;
+ if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner))
+ return true;
+ if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return true;
+ return false;
+}
+
/* Execute a getfsmap query against the regular data device. */
STATIC int
__xfs_getfsmap_datadev(
@@ -598,7 +611,6 @@ __xfs_getfsmap_datadev(
* low to the fsmap low key and max out the high key to the end
* of the AG.
*/
- info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
if (error)
@@ -608,12 +620,9 @@ __xfs_getfsmap_datadev(
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
- /* empty */
- } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) ||
- (info->low.rm_flags & (XFS_RMAP_ATTR_FORK |
- XFS_RMAP_BMBT_BLOCK |
- XFS_RMAP_UNWRITTEN))) {
- info->low.rm_startblock += info->low.rm_blockcount;
+ /* No previous record from which to continue */
+ } else if (rmap_not_shareable(mp, &info->low)) {
+ /* Last record seen was an unshareable extent */
info->low.rm_owner = 0;
info->low.rm_offset = 0;
@@ -621,8 +630,10 @@ __xfs_getfsmap_datadev(
if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs)
return 0;
} else {
+ /* Last record seen was a shareable file data extent */
info->low.rm_offset += info->low.rm_blockcount;
}
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->high.rm_startblock = -1U;
info->high.rm_owner = ULLONG_MAX;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e541f5c0bc25..3c210ac83713 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -113,7 +113,7 @@ xfs_inode_alloc(
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return ip;
}
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
int cpu;
bool ret = false;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
int error = 0;
flush_workqueue(mp->m_inodegc_wq);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
struct xfs_inodegc *gc;
gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
+ struct xfs_mount *mp = gc->mp;
unsigned int nofs_flag;
- ASSERT(gc->cpu == smp_processor_id());
+ /*
+ * Clear the cpu mask bit and ensure that we have seen the latest
+ * update of the gc structure associated with this CPU. This matches
+ * with the release semantics used when setting the cpumask bit in
+ * xfs_inodegc_queue.
+ */
+ cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+ smp_mb__after_atomic();
WRITE_ONCE(gc->items, 0);
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
nofs_flag = memalloc_nofs_save();
ip = llist_entry(node, struct xfs_inode, i_gclist);
- trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+ trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,6 +2065,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned int cpu_nr;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
@@ -2064,18 +2073,28 @@ xfs_inodegc_queue(
ip->i_flags |= XFS_NEED_INACTIVE;
spin_unlock(&ip->i_flags_lock);
- gc = get_cpu_ptr(mp->m_inodegc);
+ cpu_nr = get_cpu();
+ gc = this_cpu_ptr(mp->m_inodegc);
llist_add(&ip->i_gclist, &gc->list);
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
/*
+ * Ensure the list add is always seen by anyone who finds the cpumask
+ * bit set. This effectively gives the cpumask bit set operation
+ * release ordering semantics.
+ */
+ smp_mb__before_atomic();
+ if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+ cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
+ /*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
- put_cpu_ptr(gc);
+ put_cpu();
return;
}
@@ -2085,7 +2104,7 @@ xfs_inodegc_queue(
trace_xfs_inodegc_queue(mp, __return_address);
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
queue_delay);
- put_cpu_ptr(gc);
+ put_cpu();
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
@@ -2094,47 +2113,6 @@ xfs_inodegc_queue(
}
/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
- struct xfs_mount *mp,
- unsigned int dead_cpu)
-{
- struct xfs_inodegc *dead_gc, *gc;
- struct llist_node *first, *last;
- unsigned int count = 0;
-
- dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_delayed_work_sync(&dead_gc->work);
-
- if (llist_empty(&dead_gc->list))
- return;
-
- first = dead_gc->list.first;
- last = first;
- while (last->next) {
- last = last->next;
- count++;
- }
- dead_gc->list.first = NULL;
- dead_gc->items = 0;
-
- /* Add pending work to current CPU */
- gc = get_cpu_ptr(mp->m_inodegc);
- llist_add_batch(first, last, &gc->list);
- count += READ_ONCE(gc->items);
- WRITE_ONCE(gc->items, count);
-
- if (xfs_is_inodegc_enabled(mp)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
- 0);
- }
- put_cpu_ptr(gc);
-}
-
-/*
* We set the inode flag atomically with the radix tree tag. Once we get tag
* lookups on the radix tree, this inode flag can go away.
*
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
if (!xfs_is_inodegc_enabled(mp))
return 0;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
unsigned int h = READ_ONCE(gc->shrinker_hits);
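
Taken together, the xfs_icache.c changes replace online-CPU iteration with a mount-local cpumask; a condensed sketch of the handshake, assuming the m_inodegc_cpumask field added to struct xfs_mount later in this diff:

	/* queueing side: publish the list add before setting the mask bit */
	llist_add(&ip->i_gclist, &gc->list);
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);

	/* flush/shrinker side: only walk CPUs that may have queued work */
	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		struct xfs_inodegc *gc = per_cpu_ptr(mp->m_inodegc, cpu);

		if (!llist_empty(&gc->list))
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
	}
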
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2fa6f2e09d07..905944dafbe5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 360fe83a334f..4d55f58d99b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1642,8 +1642,11 @@ xfs_inode_needs_inactive(
if (VFS_I(ip)->i_mode == 0)
return false;
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
return false;
/* If the log isn't running, push inodes straight to reclaim. */
@@ -1703,8 +1706,11 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
goto out;
/* Metadata inodes require explicit resource cleanup. */
@@ -1736,9 +1742,21 @@ xfs_inactive(
ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
- error = xfs_qm_dqattach(ip);
- if (error)
- goto out;
+ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
+ /*
+ * If this inode is being inactivated during a quotacheck and
+ * has not yet been scanned by quotacheck, we /must/ remove
+ * the dquots from the inode before inactivation changes the
+ * block and inode counts. Most probably this is a result of
+ * reloading the incore iunlinked list to purge unrecovered
+ * unlinked inodes.
+ */
+ xfs_qm_dqdetach(ip);
+ } else {
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out;
+ }
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
@@ -1822,12 +1840,17 @@ xfs_iunlink_lookup(
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* Caller can handle inode not being in memory. */
+ rcu_read_unlock();
+ return NULL;
+ }
/*
- * Inode not in memory or in RCU freeing limbo should not happen.
- * Warn about this and let the caller handle the failure.
+ * Inode in RCU freeing limbo should not happen. Warn about this and
+ * let the caller handle the failure.
*/
- if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ if (WARN_ON_ONCE(!ip->i_ino)) {
rcu_read_unlock();
return NULL;
}
@@ -1836,7 +1859,10 @@ xfs_iunlink_lookup(
return ip;
}
-/* Update the prev pointer of the next agino. */
+/*
+ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
+ * is not in cache.
+ */
static int
xfs_iunlink_update_backref(
struct xfs_perag *pag,
@@ -1851,7 +1877,8 @@ xfs_iunlink_update_backref(
ip = xfs_iunlink_lookup(pag, next_agino);
if (!ip)
- return -EFSCORRUPTED;
+ return -ENOLINK;
+
ip->i_prev_unlinked = prev_agino;
return 0;
}
@@ -1895,6 +1922,64 @@ xfs_iunlink_update_bucket(
return 0;
}
+/*
+ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
+ * to @prev_agino. Caller must hold the AGI to synchronize with other changes
+ * to the unlinked list.
+ */
+STATIC int
+xfs_iunlink_reload_next(
+ struct xfs_trans *tp,
+ struct xfs_buf *agibp,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_perag *pag = agibp->b_pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *next_ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(next_agino != NULLAGINO);
+
+#ifdef DEBUG
+ rcu_read_lock();
+ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
+ ASSERT(next_ip == NULL);
+ rcu_read_unlock();
+#endif
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
+ next_agino, pag->pag_agno);
+
+ /*
+ * Use an untrusted lookup just to be cautious in case the AGI has been
+ * corrupted and now points at a free inode. That shouldn't happen,
+ * but we'd rather shut down now since we're already running in a weird
+ * situation.
+ */
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
+ if (error)
+ return error;
+
+ /* If this is not an unlinked inode, something is very wrong. */
+ if (VFS_I(next_ip)->i_nlink != 0) {
+ error = -EFSCORRUPTED;
+ goto rele;
+ }
+
+ next_ip->i_prev_unlinked = prev_agino;
+ trace_xfs_iunlink_reload_next(next_ip);
+rele:
+ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ if (xfs_is_quotacheck_running(mp) && next_ip)
+ xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
+ xfs_irele(next_ip);
+ return error;
+}
+
static int
xfs_iunlink_insert_inode(
struct xfs_trans *tp,
@@ -1926,6 +2011,8 @@ xfs_iunlink_insert_inode(
* inode.
*/
error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
if (error)
return error;
@@ -1941,6 +2028,7 @@ xfs_iunlink_insert_inode(
}
/* Point the head of the list to point to this inode. */
+ ip->i_prev_unlinked = NULLAGINO;
return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
@@ -2020,6 +2108,9 @@ xfs_iunlink_remove_inode(
*/
error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
ip->i_next_unlinked);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
if (error)
return error;
@@ -2040,7 +2131,7 @@ xfs_iunlink_remove_inode(
}
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return error;
}
@@ -3529,3 +3620,117 @@ xfs_iunlock2_io_mmap(
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
+
+/*
+ * Reload the incore inode list for this inode. Caller should ensure that
+ * the link count cannot change, either by taking ILOCK_SHARED or otherwise
+ * preventing other threads from executing.
+ */
+int
+xfs_inode_reload_unlinked_bucket(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t prev_agino, next_agino;
+ unsigned int bucket;
+ bool foundit = false;
+ int error;
+
+ /* Grab the first inode in the list */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_ialloc_read_agi(pag, tp, &agibp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /*
+ * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
+ * incore unlinked list pointers for this inode. Check once more to
+ * see if we raced with anyone else to reload the unlinked list.
+ */
+ if (!xfs_inode_unlinked_incomplete(ip)) {
+ foundit = true;
+ goto out_agibp;
+ }
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+ agi = agibp->b_addr;
+
+ trace_xfs_inode_reload_unlinked_bucket(ip);
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
+ agino, agno);
+
+ prev_agino = NULLAGINO;
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ struct xfs_inode *next_ip = NULL;
+
+ /* Found this caller's inode, set its backlink. */
+ if (next_agino == agino) {
+ next_ip = ip;
+ next_ip->i_prev_unlinked = prev_agino;
+ foundit = true;
+ goto next_inode;
+ }
+
+ /* Try in-memory lookup first. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (next_ip)
+ goto next_inode;
+
+ /* Inode not in memory, try reloading it. */
+ error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
+ next_agino);
+ if (error)
+ break;
+
+ /* Grab the reloaded inode. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!next_ip) {
+ /* No incore inode at all? We reloaded it... */
+ ASSERT(next_ip != NULL);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+next_inode:
+ prev_agino = next_agino;
+ next_agino = next_ip->i_next_unlinked;
+ }
+
+out_agibp:
+ xfs_trans_brelse(tp, agibp);
+ /* Should have found this inode somewhere in the iunlinked bucket. */
+ if (!error && !foundit)
+ error = -EFSCORRUPTED;
+ return error;
+}
+
+/* Decide if this inode is missing its unlinked list and reload it. */
+int
+xfs_inode_reload_unlinked(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_inode_unlinked_incomplete(ip))
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_trans_cancel(tp);
+
+ return error;
+}
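
A minimal caller sketch, mirroring how the xfs_export.c and xfs_qm.c hunks elsewhere in this diff use the new helper (an unlocked check first, since unrecovered unlinked inodes should be rare):

	if (xfs_inode_unlinked_incomplete(ip)) {
		error = xfs_inode_reload_unlinked(ip);
		if (error) {
			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
			return error;
		}
	}
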
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7547caf2f2ab..0c5bdb91152e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -68,8 +68,21 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
- /* unlinked list pointers */
+ /*
+ * Unlinked list pointers. These point to the next and previous inodes
+ * in the AGI unlinked bucket list, respectively. These fields can
+ * only be updated with the AGI locked.
+ *
+ * i_next_unlinked caches di_next_unlinked.
+ */
xfs_agino_t i_next_unlinked;
+
+ /*
+ * If the inode is not on an unlinked list, this field is zero. If the
+ * inode is the first element in an unlinked list, this field is
+ * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode
+ * in the unlinked list.
+ */
xfs_agino_t i_prev_unlinked;
/* VFS inode */
@@ -81,6 +94,11 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
+{
+ return ip->i_prev_unlinked != 0;
+}
+
static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
{
return ip->i_forkoff > 0;
@@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
*/
#define XFS_INACTIVATING (1 << 13)
+/* Quotacheck is running but inode has not been added to quota counts. */
+#define XFS_IQUOTAUNCHECKED (1 << 14)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
- XFS_INACTIVATING)
+ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
/*
* Flags for inode locking.
@@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+static inline bool
+xfs_inode_unlinked_incomplete(
+ struct xfs_inode *ip)
+{
+ return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
+}
+int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2ededd3f6b8c..1c1e6171209d 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,10 +573,10 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
- fill_mg_cmtime(stat, request_mask, inode);
-
if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
@@ -917,7 +917,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_mgtime(inode);
+ current_time(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c2093cb56092..f5377ba5967a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -80,6 +80,17 @@ xfs_bulkstat_one_int(
if (error)
goto out;
+ /* Reload the incore unlinked list to avoid failure in inodegc. */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return error;
+ }
+ }
+
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 79004d193e54..51c100c86177 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -715,15 +715,7 @@ xfs_log_mount(
* just worked.
*/
if (!xfs_has_norecovery(mp)) {
- /*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
- &mp->m_opstate);
error = xlog_recover(log);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
@@ -772,7 +764,6 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
struct xlog *log = mp->m_log;
- bool readonly;
int error = 0;
if (xfs_has_norecovery(mp)) {
@@ -781,12 +772,6 @@ xfs_log_mount_finish(
}
/*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
-
- /*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -835,8 +820,6 @@ xfs_log_mount_finish(
xfs_buftarg_drain(mp->m_ddev_targp);
clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/* Make sure the log is dead if we're returning failure. */
ASSERT(!error || xlog_is_shutdown(log));
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index eccbfb99e894..ebc70aaa299c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate(
struct xlog_cil_pcp *cilpcp;
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate(
if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
return;
- for_each_online_cpu(cpu) {
+ /*
+ * We can race with other cpus setting cil_pcpmask. However, we've
+ * atomically cleared PCP_SPACE which forces other threads to add to
+ * the global space used count. cil_pcpmask is a superset of cilpcp
+ * structures that could have a nonzero space_used.
+ */
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
int old, prev;
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +560,7 @@ xlog_cil_insert_items(
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
int space_used;
int order;
+ unsigned int cpu_nr;
struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -577,7 +584,12 @@ xlog_cil_insert_items(
* can't be scheduled away between split sample/update operations that
* are done without outside locking to serialise them.
*/
- cilpcp = get_cpu_ptr(cil->xc_pcp);
+ cpu_nr = get_cpu();
+ cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+ /* Tell the future push that there was work added by this CPU. */
+ if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+ cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
/*
* We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +675,7 @@ xlog_cil_insert_items(
continue;
list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
- put_cpu_ptr(cilpcp);
+ put_cpu();
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -1791,38 +1803,6 @@ out_shutdown:
}
/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
- struct xlog *log,
- unsigned int cpu)
-{
- struct xfs_cil *cil = log->l_cilp;
- struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
- struct xfs_cil_ctx *ctx;
-
- down_write(&cil->xc_ctx_lock);
- ctx = cil->xc_ctx;
- if (ctx->ticket)
- ctx->ticket->t_curr_res += cilpcp->space_reserved;
- cilpcp->space_reserved = 0;
-
- if (!list_empty(&cilpcp->log_items))
- list_splice_init(&cilpcp->log_items, &ctx->log_items);
- if (!list_empty(&cilpcp->busy_extents))
- list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
- atomic_add(cilpcp->space_used, &ctx->space_used);
- cilpcp->space_used = 0;
- up_write(&cil->xc_ctx_lock);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1bd2963e8fbd..af87648331d5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -231,6 +231,12 @@ struct xfs_cil_ctx {
struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
+
+ /*
+ * CPUs that could have added items to the percpu CIL data. Access is
+ * coordinated with xc_ctx_lock.
+ */
+ struct cpumask cil_pcpmask;
};
/*
@@ -278,9 +284,6 @@ struct xfs_cil {
wait_queue_head_t xc_push_wait; /* background push throttle */
void __percpu *xc_pcp; /* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
- struct list_head xc_pcp_list;
-#endif
} ____cacheline_aligned_in_smp;
/* xc_flags bit values */
@@ -705,9 +708,4 @@ xlog_kvmalloc(
return p;
}
-/*
- * CIL CPU dead notifier
- */
-void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82c81d20459d..13b94d2e605b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -329,7 +329,7 @@ xlog_find_verify_cycle(
* try a smaller size. We need to be able to read at least
* a log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(nbblks);
+ bufblks = roundup_pow_of_two(nbblks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
@@ -1528,7 +1528,7 @@ xlog_write_log_records(
* a smaller size. We need to be able to write at least a
* log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(blocks);
+ bufblks = roundup_pow_of_two(blocks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
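
ffs() reports the position of the lowest set bit, so the old expression bore no particular relation to the number of blocks requested; a quick worked example with the kernel's 1-based ffs():

	nbblks = 6:  1 << ffs(6) = 1 << 2 = 4      (smaller than the 6 blocks needed)
	nbblks = 8:  1 << ffs(8) = 1 << 4 = 16     (double what is needed)
	roundup_pow_of_two(6) = 8, roundup_pow_of_two(8) = 8
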
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a25eece3be2b..d19cca099bc3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,7 @@ struct xfs_error_cfg {
* Per-cpu deferred inode inactivation GC lists.
*/
struct xfs_inodegc {
+ struct xfs_mount *mp;
struct llist_head list;
struct delayed_work work;
int error;
@@ -67,9 +68,7 @@ struct xfs_inodegc {
/* approximate count of inodes in the list */
unsigned int items;
unsigned int shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
unsigned int cpu;
-#endif
};
/*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
- struct list_head m_mount_list; /* global mount list */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@@ -249,6 +247,9 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
+
+ /* cpus that have inodes queued for inactivation */
+ struct cpumask m_inodegc_cpumask;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -404,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_SHRINK 8
/* Kernel has logged a warning about logged xattr updates being used. */
#define XFS_OPSTATE_WARNED_LARP 9
+/* Mount time quotacheck is running */
+#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -426,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32)
__XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+#ifdef CONFIG_XFS_QUOTA
+__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
+#else
+# define xfs_is_quotacheck_running(mp) (false)
+#endif
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -443,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
- { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
+ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
/*
* Max and min values for mount-option defined I/O
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6abcc34fafd8..086e78a6143a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1160,6 +1160,19 @@ xfs_qm_dqusage_adjust(
if (error)
return error;
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto error0;
+ }
+ }
+
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
@@ -1173,6 +1186,7 @@ xfs_qm_dqusage_adjust(
}
nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+ xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
/*
* Add the (disk blocks and inode) resources occupied by this
@@ -1319,8 +1333,10 @@ xfs_qm_quotacheck(
flags |= XFS_PQUOTA_CHKD;
}
+ xfs_set_quotacheck_running(mp);
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
+ xfs_clear_quotacheck_running(mp);
/*
* On error, the inode walk may have partially populated the dquot
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index edd8587658d5..2d4444d61e98 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -477,6 +477,7 @@ xfs_cui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
@@ -514,8 +515,9 @@ xfs_cui_item_recover(
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 520c7ebdfed8..0e0e747028da 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -507,6 +507,7 @@ xfs_rui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
@@ -530,8 +531,9 @@ xfs_rui_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
rudp = xfs_trans_get_rud(tp, ruip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1f77014c6e1a..819a3568b28f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -56,28 +56,6 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
-#ifdef CONFIG_HOTPLUG_CPU
-static LIST_HEAD(xfs_mount_list);
-static DEFINE_SPINLOCK(xfs_mount_list_lock);
-
-static inline void xfs_mount_list_add(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_add(&mp->m_mount_list, &xfs_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-
-static inline void xfs_mount_list_del(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_del(&mp->m_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
-static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
-#endif
-
enum xfs_dax_mode {
XFS_DAX_INODE = 0,
XFS_DAX_ALWAYS = 1,
@@ -1135,9 +1113,8 @@ xfs_inodegc_init_percpu(
for_each_possible_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
-#if defined(DEBUG) || defined(XFS_WARN)
gc->cpu = cpu;
-#endif
+ gc->mp = mp;
init_llist_head(&gc->list);
gc->items = 0;
gc->error = 0;
@@ -1168,7 +1145,6 @@ xfs_fs_put_super(
xfs_freesb(mp);
xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1577,13 +1553,6 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_counters;
- /*
- * All percpu data structures requiring cleanup when a cpu goes offline
- * must be allocated before adding this @mp to the cpu-dead handler's
- * mount list.
- */
- xfs_mount_list_add(mp);
-
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
@@ -1781,7 +1750,6 @@ xfs_fs_fill_super(
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
out_destroy_inodegc:
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
@@ -2065,7 +2033,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
@@ -2326,49 +2294,6 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
-#ifdef CONFIG_HOTPLUG_CPU
-static int
-xfs_cpu_dead(
- unsigned int cpu)
-{
- struct xfs_mount *mp, *n;
-
- spin_lock(&xfs_mount_list_lock);
- list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
- spin_unlock(&xfs_mount_list_lock);
- xfs_inodegc_cpu_dead(mp, cpu);
- xlog_cil_pcp_dead(mp->m_log, cpu);
- spin_lock(&xfs_mount_list_lock);
- }
- spin_unlock(&xfs_mount_list_lock);
- return 0;
-}
-
-static int __init
-xfs_cpu_hotplug_init(void)
-{
- int error;
-
- error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
- xfs_cpu_dead);
- if (error < 0)
- xfs_alert(NULL,
-"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
- error);
- return error;
-}
-
-static void
-xfs_cpu_hotplug_destroy(void)
-{
- cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline int xfs_cpu_hotplug_init(void) { return 0; }
-static inline void xfs_cpu_hotplug_destroy(void) {}
-#endif
-
STATIC int __init
init_xfs_fs(void)
{
@@ -2385,13 +2310,9 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_cpu_hotplug_init();
- if (error)
- goto out;
-
error = xfs_init_caches();
if (error)
- goto out_destroy_hp;
+ goto out;
error = xfs_init_workqueues();
if (error)
@@ -2475,8 +2396,6 @@ init_xfs_fs(void)
xfs_destroy_workqueues();
out_destroy_caches:
xfs_destroy_caches();
- out_destroy_hp:
- xfs_cpu_hotplug_destroy();
out:
return error;
}
@@ -2500,7 +2419,6 @@ exit_xfs_fs(void)
xfs_destroy_workqueues();
xfs_destroy_caches();
xfs_uuid_table_free();
- xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 902c7f67a117..3926cf7f2a6e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3824,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->new_ptr)
);
+TRACE_EVENT(xfs_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xfs_inode_reload_unlinked_bucket,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 43e5c219aaed..a3975f325f4e 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -46,6 +46,17 @@ xfs_attr_grab_log_assist(
if (xfs_sb_version_haslogxattrs(&mp->m_sb))
return 0;
+ /*
+ * Check if the filesystem featureset is new enough to set this log
+ * incompat feature bit. Strictly speaking, the minimum requirement is
+ * a V5 filesystem for the superblock field, but we'll require rmap
+ * or reflink to avoid having to deal with really old kernels.
+ */
+ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) {
+ error = -EOPNOTSUPP;
+ goto drop_incompat;
+ }
+
/* Enable log-assisted xattrs. */
error = xfs_add_incompat_log_feature(mp,
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);