summaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2018-02-06 21:12:31 +0100
committerIngo Molnar <mingo@kernel.org>2018-02-06 21:12:31 +0100
commit82845079160817cc6ac64e5321bbd935e0a47b3a (patch)
tree0886d1d52428e9db14536cae4b37db896e7c360a /fs/ocfs2
parent32e839dda3ba576943365f0f5817ce5c843137dc (diff)
parent68c5735eaa5e680e701c9a2d1e3c7880bdf5ab66 (diff)
Merge branch 'linus' into sched/urgent, to resolve conflicts
Conflicts: arch/arm64/kernel/entry.S arch/x86/Kconfig include/linux/sched/mm.h kernel/fork.c Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/acl.c6
-rw-r--r--fs/ocfs2/alloc.c261
-rw-r--r--fs/ocfs2/alloc.h1
-rw-r--r--fs/ocfs2/aops.c10
-rw-r--r--fs/ocfs2/cluster/quorum.c5
-rw-r--r--fs/ocfs2/cluster/tcp.c3
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c7
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmglue.c136
-rw-r--r--fs/ocfs2/dlmglue.h35
-rw-r--r--fs/ocfs2/extent_map.c45
-rw-r--r--fs/ocfs2/extent_map.h3
-rw-r--r--fs/ocfs2/file.c101
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/ocfs2_trace.h10
-rw-r--r--fs/ocfs2/suballoc.c8
-rw-r--r--fs/ocfs2/super.c13
-rw-r--r--fs/ocfs2/xattr.c5
23 files changed, 602 insertions, 86 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 40b5cc97f7b0..917fadca8a7b 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
if (had_lock < 0)
return ERR_PTR(had_lock);
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
@@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return 0;
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
if (IS_ERR(acl) || !acl)
return PTR_ERR(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
@@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle,
if (!S_ISLNK(inode->i_mode)) {
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ down_read(&OCFS2_I(dir)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
dir_bh);
+ up_read(&OCFS2_I(dir)->ip_xattr_sem);
if (IS_ERR(acl))
return PTR_ERR(acl);
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab5105f9767e..9a876bb07cac 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct buffer_head **new_eb_bh,
+ int blk_wanted, int *blk_given);
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
+
static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
if (!obj)
obj = (void *)bh->b_data;
et->et_object = obj;
+ et->et_dealloc = NULL;
et->et_ops->eo_fill_root_el(et);
if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
- int status, new_blocks, i;
+ int status, new_blocks, i, block_given = 0;
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
@@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle,
goto bail;
}
- status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
- meta_ac, new_eb_bhs);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ /* Firstyly, try to reuse dealloc since we have already estimated how
+ * many extent blocks we may use.
+ */
+ if (!ocfs2_is_dealloc_empty(et)) {
+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
+ new_eb_bhs, new_blocks,
+ &block_given);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ BUG_ON(block_given > new_blocks);
+
+ if (block_given < new_blocks) {
+ BUG_ON(!meta_ac);
+ status = ocfs2_create_new_meta_bhs(handle, et,
+ new_blocks - block_given,
+ meta_ac,
+ &new_eb_bhs[block_given]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
- int status, i;
+ int status, i, block_given = 0;
u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
- status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
- &new_eb_bh);
+ if (!ocfs2_is_dealloc_empty(et)) {
+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
+ &new_eb_bh, 1,
+ &block_given);
+ } else if (meta_ac) {
+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+ &new_eb_bh);
+
+ } else {
+ BUG();
+ }
+
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
int depth = le16_to_cpu(el->l_tree_depth);
struct buffer_head *bh = NULL;
- BUG_ON(meta_ac == NULL);
+ BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
shift = ocfs2_find_branch_target(et, &bh);
if (shift < 0) {
@@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle,
int i;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
- struct ocfs2_extent_list *el;
struct ocfs2_extent_block *eb;
- el = path_leaf_el(left_path);
-
eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
@@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec)
{
- int ret, i, next_free;
+ int i, next_free;
struct buffer_head *bh;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
@@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
"Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
- ret = -EIO;
return;
}
@@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle,
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
struct ocfs2_merge_ctxt ctxt;
- struct ocfs2_extent_list *rightmost_el;
if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
@@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle,
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- rightmost_el = &eb->h_list;
- } else
- rightmost_el = path_root_el(path);
+ }
if (rec->e_cpos == split_rec->e_cpos &&
rec->e_leaf_clusters == split_rec->e_leaf_clusters)
@@ -6585,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type,
return fl;
}
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_preferred_free_list(int type,
+ int preferred_slot,
+ int *real_slot,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+ while (fl) {
+ if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
+ *real_slot = fl->f_slot;
+ return fl;
+ }
+
+ fl = fl->f_next_suballocator;
+ }
+
+ /* If we can't find any free list matching preferred slot, just use
+ * the first one.
+ */
+ fl = ctxt->c_first_suballocator;
+ *real_slot = fl->f_slot;
+
+ return fl;
+}
+
+/* Return Value 1 indicates empty */
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_per_slot_free_list *fl = NULL;
+
+ if (!et->et_dealloc)
+ return 1;
+
+ fl = et->et_dealloc->c_first_suballocator;
+ if (!fl)
+ return 1;
+
+ if (!fl->f_first)
+ return 1;
+
+ return 0;
+}
+
+/* If extent was deleted from tree due to extent rotation and merging, and
+ * no metadata is reserved ahead of time. Try to reuse some extents
+ * just deleted. This is only used to reuse extent blocks.
+ * It is supposed to find enough extent blocks in dealloc if our estimation
+ * on metadata is accurate.
+ */
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct buffer_head **new_eb_bh,
+ int blk_wanted, int *blk_given)
+{
+ int i, status = 0, real_slot;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+ struct ocfs2_per_slot_free_list *fl;
+ struct ocfs2_cached_block_free *bf;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+ *blk_given = 0;
+
+ /* If extent tree doesn't have a dealloc, this is not faulty. Just
+ * tell upper caller dealloc can't provide any block and it should
+ * ask for alloc to claim more space.
+ */
+ dealloc = et->et_dealloc;
+ if (!dealloc)
+ goto bail;
+
+ for (i = 0; i < blk_wanted; i++) {
+ /* Prefer to use local slot */
+ fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+ osb->slot_num, &real_slot,
+ dealloc);
+ /* If no more block can be reused, we should claim more
+ * from alloc. Just return here normally.
+ */
+ if (!fl) {
+ status = 0;
+ break;
+ }
+
+ bf = fl->f_first;
+ fl->f_first = bf->free_next;
+
+ new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
+ if (new_eb_bh[i] == NULL) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Reusing block(%llu) from "
+ "dealloc(local slot:%d, real slot:%d)\n",
+ bf->free_blk, osb->slot_num, real_slot);
+
+ ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
+
+ status = ocfs2_journal_access_eb(handle, et->et_ci,
+ new_eb_bh[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
+ eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
+
+ /* We can't guarantee that buffer head is still cached, so
+ * polutlate the extent block again.
+ */
+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+ eb->h_blkno = cpu_to_le64(bf->free_blk);
+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+ eb->h_suballoc_slot = cpu_to_le16(real_slot);
+ eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
+ eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
+ eb->h_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+ /* We'll also be dirtied by the caller, so
+ * this isn't absolutely necessary.
+ */
+ ocfs2_journal_dirty(handle, new_eb_bh[i]);
+
+ if (!fl->f_first) {
+ dealloc->c_first_suballocator = fl->f_next_suballocator;
+ kfree(fl);
+ }
+ kfree(bf);
+ }
+
+ *blk_given = i;
+
+bail:
+ if (unlikely(status < 0)) {
+ for (i = 0; i < blk_wanted; i++)
+ brelse(new_eb_bh[i]);
+ }
+
+ return status;
+}
+
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int type, int slot, u64 suballoc,
u64 blkno, unsigned int bit)
@@ -7382,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7419,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
trace_ocfs2_trim_fs(start, len, minlen);
+ ocfs2_trim_fs_lock_res_init(osb);
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ goto out_unlock;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ goto out_unlock;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == start && info.tf_len == len &&
+ info.tf_minlen == minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out_trimunlock;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = start;
+ info.tf_len = len;
+ info.tf_minlen = minlen;
+
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno)
@@ -7463,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
}
range->len = trimmed * sb->s_blocksize;
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret ? 0 : 1);
+ pinfo = &info;
+out_trimunlock:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 27b75cf32cfa..250bcacdf9e9 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
+ struct ocfs2_cached_dealloc_ctxt *et_dealloc;
};
/*
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d1516327b787..e8e205bf2e41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
struct ocfs2_cached_dealloc_ctxt w_dealloc;
struct list_head w_unwritten_list;
+ unsigned int w_unwritten_count;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1386,6 +1387,7 @@ retry:
desc->c_clear_unwritten = 0;
list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ wc->w_unwritten_count++;
new = NULL;
unlock:
spin_unlock(&oi->ip_lock);
@@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
ue->ue_phys = desc->c_phys;
list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
- dwc->dw_zero_count++;
+ dwc->dw_zero_count += wc->w_unwritten_count;
}
ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
@@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ /* Attach dealloc with extent tree in case that we may reuse extents
+ * which are already unlinked from current extent tree due to extent
+ * rotation and merging.
+ */
+ et.et_dealloc = &dealloc;
+
ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
&data_ac, &meta_ac);
if (ret) {
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 62e8ec619b4c..af2e7473956e 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node)
node, qs->qs_connected);
clear_bit(node, qs->qs_conn_bm);
+
+ if (test_bit(node, qs->qs_hb_bm))
+ o2quo_set_hold(qs, node);
}
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
- if (test_bit(node, qs->qs_hb_bm))
- o2quo_set_hold(qs, node);
spin_unlock(&qs->qs_lock);
}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index bebe59feca58..eac5140aac47 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -918,7 +918,8 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
struct kvec vec = { .iov_len = len, .iov_base = data, };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
- return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
+ iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len);
+ return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76a..0276f7f8d5e6 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -196,7 +196,7 @@ struct o2net_msg_handler {
u32 nh_msg_type;
u32 nh_key;
o2net_msg_handler_func *nh_func;
- o2net_msg_handler_func *nh_func_data;
+ void *nh_func_data;
o2net_post_msg_handler_func
*nh_post_func;
struct kref nh_kref;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 32f9c72dff17..b7520e20a770 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9c3e0f13ca87..a7df226f9449 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1122,13 +1122,6 @@ recheck:
/* sleep if we haven't finished voting yet */
if (sleep) {
unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
-
- /*
- if (kref_read(&mle->mle_refs) < 2)
- mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
- kref_read(&mle->mle_refs),
- res->lockname.len, res->lockname.name);
- */
atomic_set(&mle->woken, 0);
(void)wait_event_timeout(mle->wq,
(atomic_read(&mle->woken) == 1),
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 9c7c18c0e129..385fcefa8bc5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -220,9 +220,9 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait)
{
- int event = 0;
+ __poll_t event = 0;
struct inode *inode = file_inode(file);
struct dlmfs_inode_private *ip = DLMFS_I(inode);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4689940a953c..9479f99c2145 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
@@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
+ &ocfs2_trim_fs_lops, osb);
+}
+
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ ocfs2_simple_drop_lockres(osb, lockres);
+ ocfs2_lock_res_free(lockres);
+}
+
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
struct ocfs2_super *osb)
{
@@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write)
return status;
}
+int ocfs2_try_rw_lock(struct inode *inode, int write)
+{
+ int status, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "inode %llu try to take %s RW lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
+ return status;
+}
+
void ocfs2_rw_unlock(struct inode *inode, int write)
{
int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
@@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
+ /*
+ * If we can't get inode lock immediately, we should not return
+ * directly here, since this will lead to a softlockup problem.
+ * The method is to get a blocking lock and immediately unlock
+ * before returning, this can avoid CPU resource waste due to
+ * lots of retries, and benefits fairness in getting lock.
+ */
+ if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+ ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
@@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
- int *level)
+ int *level, int wait)
{
int ret;
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ else
+ ret = ocfs2_try_inode_lock(inode, NULL, 0);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
@@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode,
struct buffer_head *bh = NULL;
ocfs2_inode_unlock(inode, 0);
- ret = ocfs2_inode_lock(inode, &bh, 1);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, &bh, 1);
+ else
+ ret = ocfs2_try_inode_lock(inode, &bh, 1);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
return ret;
}
*level = 1;
@@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
ex ? LKM_EXMODE : LKM_PRMODE);
}
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info, int trylock)
+{
+ int status;
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (info)
+ info->tf_valid = 0;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
+ trylock ? DLM_LKF_NOQUEUE : 0, 0);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+ }
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
+ info->tf_valid = 1;
+ info->tf_success = lvb->lvb_success;
+ info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
+ info->tf_start = be64_to_cpu(lvb->lvb_start);
+ info->tf_len = be64_to_cpu(lvb->lvb_len);
+ info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
+ info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
+ }
+ }
+
+ return status;
+}
+
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info)
+{
+ struct ocfs2_trim_fs_lvb *lvb;
+ struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+ if (ocfs2_mount_local(osb))
+ return;
+
+ if (info) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
+ lvb->lvb_success = info->tf_success;
+ lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
+ lvb->lvb_start = cpu_to_be64(info->tf_start);
+ lvb->lvb_len = cpu_to_be64(info->tf_len);
+ lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
+ lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
+ }
+
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+}
+
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
{
int ret;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index a7fc18ba0dc1..256e0a9067b8 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb {
__be32 lvb_os_seqno;
};
+#define OCFS2_TRIMFS_LVB_VERSION 1
+
+struct ocfs2_trim_fs_lvb {
+ __u8 lvb_version;
+ __u8 lvb_success;
+ __u8 lvb_reserved[2];
+ __be32 lvb_nodenum;
+ __be64 lvb_start;
+ __be64 lvb_len;
+ __be64 lvb_minlen;
+ __be64 lvb_trimlen;
+};
+
+struct ocfs2_trim_fs_info {
+ u8 tf_valid; /* lvb is valid, or not */
+ u8 tf_success; /* trim is successful, or not */
+ u32 tf_nodenum; /* osb node number */
+ u64 tf_start; /* trim start offset in clusters */
+ u64 tf_len; /* trim end offset in clusters */
+ u64 tf_minlen; /* trim minimum contiguous free clusters */
+ u64 tf_trimlen; /* trimmed length in bytes */
+};
+
struct ocfs2_lock_holder {
struct list_head oh_list;
struct pid *oh_owner_pid;
@@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
int ocfs2_rw_lock(struct inode *inode, int write);
+int ocfs2_try_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
- int *level);
+ int *level, int wait);
int ocfs2_inode_lock_full_nested(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
@@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+#define ocfs2_try_inode_lock(i, b, e)\
+ ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\
+ OI_LS_NORMAL)
void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
@@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb);
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb);
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info, int trylock);
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+ struct ocfs2_trim_fs_info *info);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e4719e0a3f99..06cb96462bf9 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -38,6 +38,7 @@
#include "inode.h"
#include "super.h"
#include "symlink.h"
+#include "aops.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -832,6 +833,50 @@ out:
return ret;
}
+/* Is IO overwriting allocated blocks? */
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+ u64 map_start, u64 map_len)
+{
+ int ret = 0, is_last;
+ u32 mapping_end, cpos;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec rec;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len))
+ return ret;
+ else
+ return -EAGAIN;
+ }
+
+ cpos = map_start >> osb->s_clustersize_bits;
+ mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+ map_start + map_len);
+ is_last = 0;
+ while (cpos < mapping_end && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+ NULL, &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rec.e_blkno == 0ULL)
+ break;
+
+ if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+ break;
+
+ cpos = le32_to_cpu(rec.e_cpos) +
+ le16_to_cpu(rec.e_leaf_clusters);
+ }
+
+ if (cpos < mapping_end)
+ ret = -EAGAIN;
+out:
+ return ret;
+}
+
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
{
struct inode *inode = file->f_mapping->host;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 67ea57d2fd59..1057586ec19f 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+ u64 map_start, u64 map_len);
+
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a1d051055472..5d1784a365a3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
spin_unlock(&oi->ip_lock);
}
+ file->f_mode |= FMODE_NOWAIT;
+
leave:
return status;
}
@@ -2132,12 +2134,12 @@ out:
}
static int ocfs2_prepare_inode_for_write(struct file *file,
- loff_t pos,
- size_t count)
+ loff_t pos, size_t count, int wait)
{
- int ret = 0, meta_level = 0;
+ int ret = 0, meta_level = 0, overwrite_io = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
+ struct buffer_head *di_bh = NULL;
loff_t end;
/*
@@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* if we need to make modifications here.
*/
for(;;) {
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode,
+ overwrite_io ? NULL : &di_bh, meta_level);
if (ret < 0) {
meta_level = -1;
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out;
}
+ /*
+ * Check if IO will overwrite allocated blocks in case
+ * IOCB_NOWAIT flag is set.
+ */
+ if (!wait && !overwrite_io) {
+ overwrite_io = 1;
+ if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
+ brelse(di_bh);
+ di_bh = NULL;
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+ }
+
/* Clear suid / sgid if necessary. We do this here
* instead of later in the write path because
* remove_suid() calls ->setattr without any hint that
@@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, count);
+ pos, count, wait);
+
+ brelse(di_bh);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2211,7 +2242,7 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, rw_level;
+ int rw_level;
ssize_t written = 0;
ssize_t ret;
size_t count = iov_iter_count(from);
@@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
+ int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+ int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
file->f_path.dentry->d_name.name,
(unsigned int)from->nr_segs); /* GRRRRR */
+ if (!direct_io && nowait)
+ return -EOPNOTSUPP;
+
if (count == 0)
return 0;
- direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
-
- inode_lock(inode);
+ if (nowait) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else
+ inode_lock(inode);
/*
* Concurrent O_DIRECT writes are allowed with
@@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
*/
rw_level = (!direct_io || full_coherency || append_write);
- ret = ocfs2_rw_lock(inode, rw_level);
+ if (nowait)
+ ret = ocfs2_try_rw_lock(inode, rw_level);
+ else
+ ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_mutex;
}
@@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
* other nodes to drop their caches. Buffered I/O
* already does this in write_begin().
*/
- ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (nowait)
+ ret = ocfs2_try_inode_lock(inode, NULL, 1);
+ else
+ ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out;
}
@@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
}
count = ret;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out;
}
@@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
int ret = 0, rw_level = -1, lock_level = 0;
struct file *filp = iocb->ki_filp;
struct inode *inode = file_inode(filp);
+ int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+ int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
goto bail;
}
+ if (!direct_io && nowait)
+ return -EOPNOTSUPP;
+
/*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = ocfs2_rw_lock(inode, 0);
+ if (direct_io) {
+ if (nowait)
+ ret = ocfs2_try_rw_lock(inode, 0);
+ else
+ ret = ocfs2_rw_lock(inode, 0);
+
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto bail;
}
rw_level = 0;
@@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
+ !nowait);
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto bail;
}
ocfs2_inode_unlock(inode, lock_level);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 36304434eacf..e5dcea6cee5f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle,
/* we can safely remove this assertion after testing. */
if (!buffer_uptodate(bh)) {
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
- mlog(ML_ERROR, "b_blocknr=%llu\n",
- (unsigned long long)bh->b_blocknr);
+ mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n",
+ (unsigned long long)bh->b_blocknr, bh->b_state);
lock_buffer(bh);
/*
- * A previous attempt to write this buffer head failed.
- * Nothing we can do but to retry the write and hope for
- * the best.
+ * A previous transaction with a couple of buffer heads fail
+ * to checkpoint, so all the bhs are marked as BH_Write_EIO.
+ * For current transaction, the bh is just among those error
+ * bhs which previous transaction handle. We can't just clear
+ * its BH_Write_EIO and reuse directly, since other bhs are
+ * not written to disk yet and that will cause metadata
+ * inconsistency. So we should set fs read-only to avoid
+ * further damage.
*/
if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- }
-
- if (!buffer_uptodate(bh)) {
unlock_buffer(bh);
- return -EIO;
+ return ocfs2_error(osb->sb, "A previous attempt to "
+ "write this buffer head failed\n");
}
unlock_buffer(bh);
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 098f5c712569..fb9a20e3d608 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
int ret = 0, lock_level = 0;
ret = ocfs2_inode_lock_atime(file_inode(file),
- file->f_path.mnt, &lock_level);
+ file->f_path.mnt, &lock_level, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 9a50f222ac97..6867eef2e06b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -404,6 +404,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct ocfs2_lock_res osb_trim_fs_lockres;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index d277aabf5dfb..7051b994c776 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -50,6 +50,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_LOCK_TYPE_ORPHAN_SCAN,
OCFS2_LOCK_TYPE_REFCOUNT,
+ OCFS2_LOCK_TYPE_TRIM_FS,
OCFS2_NUM_LOCK_TYPES
};
@@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_REFCOUNT:
c = 'T';
break;
+ case OCFS2_LOCK_TYPE_TRIM_FS:
+ c = 'I';
+ break;
default:
c = '\0';
}
@@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
+ [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a0b5d00ef0a9..e2a11aaece10 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- unsigned long count),
- TP_ARGS(ino, saved_pos, count),
+ unsigned long count, int wait),
+ TP_ARGS(ino, saved_pos, count, wait),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
__field(unsigned long, count)
+ __field(int, wait)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
__entry->count = count;
+ __entry->wait = wait;
),
- TP_printk("%llu %llu %lu", __entry->ino,
- __entry->saved_pos, __entry->count)
+ TP_printk("%llu %llu %lu %d", __entry->ino,
+ __entry->saved_pos, __entry->count, __entry->wait)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 9f0b95abc09f..d8f5f6ce99dc 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ if (undo_fn)
+ jbd_unlock_bh_state(group_bh);
return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
@@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle,
int status;
u16 bg_start_bit;
u64 bg_blkno;
- struct ocfs2_dinode *fe;
/* You can't ever have a contiguous set of clusters
* bigger than a block group bitmap so we never have to worry
* about looping on them.
* This is expensive. We can safely remove once this stuff has
* gotten tested really well. */
- BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+ BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+ ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+ start_blk)));
- fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
&bg_start_bit);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 80efa5699fb0..ffa4952d432b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
- status = -EINVAL;
+ status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
mlog_errno(status);
- /* FIXME: Should ERROR_RO_FS */
mlog(ML_ERROR, "Unable to load system inode %d, "
"possibly corrupt fs?", i);
goto bail;
@@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
- status = -EINVAL;
+ status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
status, i, osb->slot_num);
goto bail;
@@ -1208,14 +1207,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
read_super_error:
brelse(bh);
+ if (status)
+ mlog_errno(status);
+
if (osb) {
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
ocfs2_dismount_volume(sb, 1);
}
- if (status)
- mlog_errno(status);
return status;
}
@@ -1843,6 +1843,9 @@ static int ocfs2_mount_volume(struct super_block *sb)
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
+ if (status == -EBADR && ocfs2_userspace_stack(osb))
+ mlog(ML_ERROR, "couldn't mount because cluster name on"
+ " disk does not match the running cluster name.\n");
goto leave;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c5898c59d411..c261c1dfd374 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir,
si->value_len);
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ down_read(&OCFS2_I(dir)->ip_xattr_sem);
acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
"", NULL, 0);
+ up_read(&OCFS2_I(dir)->ip_xattr_sem);
if (acl_len > 0) {
a_size = ocfs2_xattr_entry_real_size(0, acl_len);
if (S_ISDIR(mode))
a_size <<= 1;
} else if (acl_len != 0 && acl_len != -ENODATA) {
+ ret = acl_len;
mlog_errno(ret);
return ret;
}
@@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
* and then insert the extents one by one.
*/
if (xv->xr_list.l_tree_depth) {
- memcpy(new_xv, &def_xv, sizeof(def_xv));
+ memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE);
vb->vb_xv = new_xv;
vb->vb_bh = value_bh;
ocfs2_init_xattr_value_extent_tree(&data_et,