author     Chao Yu <yuchao0@huawei.com>        2018-08-02 23:03:19 +0800
committer  Jaegeuk Kim <jaegeuk@kernel.org>    2018-08-10 16:19:05 -0700
commit     50fa53eccf9f911a5b435248a2b0bd484fd82e5e (patch)
tree       f2fbb65982fcd3dfc251509a2e1800207ae7ad3e /fs/f2fs/node.c
parent     6e45f2a59ffb440f28719ab3a68bee5b8e9df16b (diff)
f2fs: fix to avoid breaking the dnode block list
f2fs's recovery flow relies on the dnode block link list: recovery of an
fsynced file depends on the persistence of the previous dnodes in that
list. So during fsync() we should wait for all of a regular inode's
dnodes to be written back before issuing the flush. This way we avoid
the dnode block list being broken by out-of-order IO submission in the
IO scheduler or driver.

Sheng Yong helped to run the test for this patch:

Target: /data (f2fs, -)
64MB / 32768KB / 4KB / 8

1 / PERSIST / Index

Base:
      SEQ-RD(MB/s)  SEQ-WR(MB/s)  RND-RD(IOPS)  RND-WR(IOPS)  Insert(TPS)  Update(TPS)  Delete(TPS)
1     867.82        204.15        41440.03      41370.54      680.8        1025.94      1031.08
2     871.87        205.87        41370.3       40275.2       791.14       1065.84      1101.7
3     866.52        205.69        41795.67      40596.16      694.69       1037.16      1031.48
Avg   868.7366667   205.2366667   41535.33333   40747.3       722.21       1042.98      1054.753333

After:
      SEQ-RD(MB/s)  SEQ-WR(MB/s)  RND-RD(IOPS)  RND-WR(IOPS)  Insert(TPS)  Update(TPS)  Delete(TPS)
1     798.81        202.5         41143         40613.87      602.71       838.08       913.83
2     805.79        206.47        40297.2       41291.46      604.44       840.75       924.27
3     814.83        206.17        41209.57      40453.62      602.85       834.66       927.91
Avg   806.4766667   205.0466667   40883.25667   40786.31667   603.3333333  837.83       922.0033333

Patched/Original:
      0.928332713   0.999074239   0.984300676   1.000957528   0.835398753  0.803303994  0.874141189

It looks like atomic writes suffer a performance regression. I suspect
the culprit is that we force waiting for all dnodes to reach the storage
cache before we issue PREFLUSH+FUA.

BTW, will commit ("f2fs: don't need to wait for node writes for atomic
write") cause this problem: we will lose the data of the last
transaction after SPO, even if the atomic write returned no error:

- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
 - writeback data: P1, P2, P3;
 - writeback node: N1, N2, N3;  <--- If N1 and N2 are not written back
   but N3 carrying the fsync_mark is, then after SPOR we won't find N3
   since the node chain is broken, so the last transaction is lost.
 - preflush + fua;
- power-cut

If we don't wait for dnode writeback for atomic_write:

      SEQ-RD(MB/s)  SEQ-WR(MB/s)  RND-RD(IOPS)  RND-WR(IOPS)  Insert(TPS)  Update(TPS)  Delete(TPS)
1     779.91        206.03        41621.5       40333.16      716.9        1038.21      1034.85
2     848.51        204.35        40082.44      39486.17      791.83       1119.96      1083.77
3     772.12        206.27        41335.25      41599.65      723.29       1055.07      971.92
Avg   800.18        205.55        41013.06333   40472.99333   744.0066667  1071.08      1030.18

Patched/Original:
      0.92108464    1.001526693   0.987425886   0.993268102   1.030180511  1.026942031  0.976702294

SQLite's performance recovers.

Jaegeuk: "Practically, I don't see db corruption because of this. We can
excuse losing the last transaction."

Finally, we decided to keep the original semantics of the atomic write
interface: we do not wait for all dnode writeback before the
preflush+fua submission.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
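For context while reading the node.c diff below: the list entry type and
the per-superblock fields this patch uses are declared in fs/f2fs/f2fs.h,
which falls outside this node.c-limited view. A minimal sketch of those
declarations follows; the field comments are inferred from how the fields
are used in the diff, not copied from f2fs.h:

    /* Sketch of the declarations this patch relies on (they live in
     * fs/f2fs/f2fs.h, outside this node.c-limited diff). */
    struct fsync_node_entry {
        struct list_head list;   /* linked on sbi->fsync_node_list */
        struct page *page;       /* warm node page, pinned via get_page() */
        unsigned int seq_id;     /* write order, from sbi->fsync_seg_id */
    };

    /* New fields added to struct f2fs_sb_info: */
    struct f2fs_sb_info {
        /* ... existing fields ... */
        struct list_head fsync_node_list;  /* entries awaiting writeback */
        spinlock_t fsync_node_lock;        /* protects list, num, seg_id */
        unsigned int fsync_seg_id;         /* next seq_id to hand out */
        unsigned int fsync_node_num;       /* entries currently listed */
    };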
Diffstat (limited to 'fs/f2fs/node.c')
-rw-r--r--  fs/f2fs/node.c | 144
1 file changed, 120 insertions(+), 24 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 21ffb784764c..81fb2f3edb52 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -28,6 +28,7 @@
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
static struct kmem_cache *nat_entry_set_slab;
+static struct kmem_cache *fsync_node_entry_slab;
/*
* Check whether the given nid is within node id range.
@@ -264,6 +265,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
+{
+ return NODE_MAPPING(sbi) == page->mapping &&
+ IS_DNODE(page) && is_cold_node(page);
+}
+
+void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+ spin_lock_init(&sbi->fsync_node_lock);
+ INIT_LIST_HEAD(&sbi->fsync_node_list);
+ sbi->fsync_seg_id = 0;
+ sbi->fsync_node_num = 0;
+}
+
+static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
+ struct page *page)
+{
+ struct fsync_node_entry *fn;
+ unsigned long flags;
+ unsigned int seq_id;
+
+ fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
+
+ get_page(page);
+ fn->page = page;
+ INIT_LIST_HEAD(&fn->list);
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ list_add_tail(&fn->list, &sbi->fsync_node_list);
+ fn->seq_id = sbi->fsync_seg_id++;
+ seq_id = fn->seq_id;
+ sbi->fsync_node_num++;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+
+ return seq_id;
+}
+
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct fsync_node_entry *fn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ list_for_each_entry(fn, &sbi->fsync_node_list, list) {
+ if (fn->page == page) {
+ list_del(&fn->list);
+ sbi->fsync_node_num--;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ kmem_cache_free(fsync_node_entry_slab, fn);
+ put_page(page);
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ f2fs_bug_on(sbi, 1);
+}
+
+void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ sbi->fsync_seg_id = 0;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+}
+
int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1388,7 +1455,7 @@ continue_unlock:
static int __write_node_page(struct page *page, bool atomic, bool *submitted,
struct writeback_control *wbc, bool do_balance,
- enum iostat_type io_type)
+ enum iostat_type io_type, unsigned int *seq_id)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
nid_t nid;
@@ -1405,6 +1472,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
.io_type = io_type,
.io_wbc = wbc,
};
+ unsigned int seq;
trace_f2fs_writepage(page, NODE);
@@ -1450,6 +1518,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
set_page_writeback(page);
ClearPageError(page);
+
+ if (f2fs_in_warm_node_list(sbi, page)) {
+ seq = f2fs_add_fsync_node_entry(sbi, page);
+ if (seq_id)
+ *seq_id = seq;
+ }
+
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1497,7 +1572,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type)
goto out_page;
if (__write_node_page(node_page, false, NULL,
- &wbc, false, FS_GC_NODE_IO))
+ &wbc, false, FS_GC_NODE_IO, NULL))
unlock_page(node_page);
goto release_page;
} else {
@@ -1514,11 +1589,13 @@ release_page:
static int f2fs_write_node_page(struct page *page,
struct writeback_control *wbc)
{
- return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
+ return __write_node_page(page, false, NULL, wbc, false,
+ FS_NODE_IO, NULL);
}
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
- struct writeback_control *wbc, bool atomic)
+ struct writeback_control *wbc, bool atomic,
+ unsigned int *seq_id)
{
pgoff_t index;
pgoff_t last_idx = ULONG_MAX;
@@ -1599,7 +1676,7 @@ continue_unlock:
ret = __write_node_page(page, atomic &&
page == last_page,
&submitted, wbc, true,
- FS_NODE_IO);
+ FS_NODE_IO, seq_id);
if (ret) {
unlock_page(page);
f2fs_put_page(last_page, 0);
@@ -1716,7 +1793,7 @@ continue_unlock:
set_dentry_mark(page, 0);
ret = __write_node_page(page, false, &submitted,
- wbc, do_balance, io_type);
+ wbc, do_balance, io_type, NULL);
if (ret)
unlock_page(page);
else if (submitted)
@@ -1749,35 +1826,46 @@ out:
return ret;
}
-int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
+ unsigned int seq_id)
{
- pgoff_t index = 0;
- struct pagevec pvec;
+ struct fsync_node_entry *fn;
+ struct page *page;
+ struct list_head *head = &sbi->fsync_node_list;
+ unsigned long flags;
+ unsigned int cur_seq_id = 0;
int ret2, ret = 0;
- int nr_pages;
- pagevec_init(&pvec);
+ while (seq_id && cur_seq_id < seq_id) {
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ break;
+ }
+ fn = list_first_entry(head, struct fsync_node_entry, list);
+ if (fn->seq_id > seq_id) {
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ break;
+ }
+ cur_seq_id = fn->seq_id;
+ page = fn->page;
+ get_page(page);
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
- while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
- PAGECACHE_TAG_WRITEBACK))) {
- int i;
+ f2fs_wait_on_page_writeback(page, NODE, true);
+ if (TestClearPageError(page))
+ ret = -EIO;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ put_page(page);
- if (ino && ino_of_node(page) == ino) {
- f2fs_wait_on_page_writeback(page, NODE, true);
- if (TestClearPageError(page))
- ret = -EIO;
- }
- }
- pagevec_release(&pvec);
- cond_resched();
+ if (ret)
+ break;
}
ret2 = filemap_check_errors(NODE_MAPPING(sbi));
if (!ret)
ret = ret2;
+
return ret;
}
@@ -2992,8 +3080,15 @@ int __init f2fs_create_node_manager_caches(void)
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destroy_free_nid;
+
+ fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+ sizeof(struct fsync_node_entry));
+ if (!fsync_node_entry_slab)
+ goto destroy_nat_entry_set;
return 0;
+destroy_nat_entry_set:
+ kmem_cache_destroy(nat_entry_set_slab);
destroy_free_nid:
kmem_cache_destroy(free_nid_slab);
destroy_nat_entry:
@@ -3004,6 +3099,7 @@ fail:
void f2fs_destroy_node_manager_caches(void)
{
+ kmem_cache_destroy(fsync_node_entry_slab);
kmem_cache_destroy(nat_entry_set_slab);
kmem_cache_destroy(free_nid_slab);
kmem_cache_destroy(nat_entry_slab);
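
The consumer of the new seq_id plumbing lives on the fsync path in
fs/f2fs/file.c, which this node.c-limited view does not show. Below is a
hedged sketch of the shape of that caller; the helper name
do_sync_file_sketch and the exact ordering around f2fs_issue_flush() are
illustrative, not the verbatim f2fs_do_sync_file():

    /* Illustrative only: a sequence id is captured while the inode's
     * dnodes are submitted; for a regular (non-atomic) fsync we then
     * block until every node page up to that id has finished writeback
     * before issuing the flush, which is exactly the ordering the
     * commit message argues for. */
    static int do_sync_file_sketch(struct f2fs_sb_info *sbi,
                                   struct inode *inode,
                                   struct writeback_control *wbc,
                                   bool atomic)
    {
        unsigned int seq_id = 0;
        int ret;

        /* Submit this inode's dnodes; seq_id records the last one queued. */
        ret = f2fs_fsync_node_pages(sbi, inode, wbc, atomic, &seq_id);
        if (ret)
            return ret;

        /* Atomic writes skip the wait: per the commit message, losing
         * only the last transaction after a power cut is acceptable. */
        if (!atomic)
            ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
        if (ret)
            return ret;

        /* Only now is it safe to issue the PREFLUSH+FUA. */
        return f2fs_issue_flush(sbi, inode->i_ino);
    }

Because f2fs_add_fsync_node_entry() appends entries under fsync_node_lock
with a monotonically increasing seq_id, fsync_node_list stays sorted;
f2fs_wait_on_node_pages_writeback() can therefore stop at the first entry
whose seq_id exceeds the target, instead of scanning the whole node
mapping for writeback pages as the removed pagevec loop used to do.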