From 14543774bd67a64f616431e5c9d1472f58979841 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Nov 2015 16:23:54 +0000 Subject: Btrfs: fix error path when failing to submit bio for direct IO write Commit 61de718fceb6 ("Btrfs: fix memory corruption on failure to submit bio for direct IO") fixed problems with the error handling code after we fail to submit a bio for direct IO. However there were 2 problems that it did not address when the failure is due to memory allocation failures for direct IO writes: 1) We considered that there could be only one ordered extent for the whole IO range, which is not always true, as we can have multiple; 2) It did not set the bit BTRFS_ORDERED_IO_DONE in the ordered extent, which can make other tasks running btrfs_wait_logged_extents() hang forever, since they wait for that bit to be set. The general assumption is that regardless of an error, the BTRFS_ORDERED_IO_DONE is always set and it precedes setting the bit BTRFS_ORDERED_COMPLETE. Fix these issues by moving part of the btrfs_endio_direct_write() handler into a new helper function and having that new helper function called when we fail to allocate memory to submit the bio (and its private object) for a direct IO write. Signed-off-by: Filipe Manana Reviewed-by: Liu Bo --- fs/btrfs/inode.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f82d1f4460dd..66106b48b478 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7995,22 +7995,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -8023,13 +8023,22 @@ out_test: * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); @@ -8365,24 +8374,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). - */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() -- cgit From b850ae14278dfc49c3a03b39357214fc79330db9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Dec 2015 16:23:16 +0000 Subject: Btrfs: fix deadlock between direct IO write and defrag/readpages If readpages() (triggered by defrag or buffered reads) is called while a direct IO write is in progress, we have a small time window where we can deadlock, resulting in traces like the following being generated: [84723.212993] INFO: task fio:2849 blocked for more than 120 seconds. [84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1 [84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000 [84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200 [84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff [84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a [84723.232085] Call Trace: [84723.232625] [] ? bit_wait+0x3c/0x3c [84723.233529] [] schedule+0x7d/0x95 [84723.234398] [] schedule_timeout+0x43/0x10b [84723.235384] [] ? time_hardirqs_on+0x15/0x28 [84723.236426] [] ? trace_hardirqs_on+0xd/0xf [84723.237502] [] ? read_seqcount_begin.constprop.20+0x57/0x6d [84723.238807] [] ? trace_hardirqs_on_caller+0x16/0x1ab [84723.242012] [] ? trace_hardirqs_on+0xd/0xf [84723.243064] [] ? timekeeping_get_ns+0xe/0x33 [84723.244116] [] ? ktime_get+0x41/0x52 [84723.245029] [] io_schedule_timeout+0xb7/0x12b [84723.245942] [] ? io_schedule_timeout+0xb7/0x12b [84723.246596] [] bit_wait_io+0x39/0x45 [84723.247503] [] __wait_on_bit_lock+0x49/0x8d [84723.248540] [] __lock_page+0x66/0x68 [84723.249558] [] ? autoremove_wake_function+0x3a/0x3a [84723.250844] [] lock_page+0x2c/0x2f [84723.251871] [] invalidate_inode_pages2_range+0xf5/0x2aa [84723.253274] [] ? filemap_fdatawait_range+0x12d/0x146 [84723.254757] [] ? filemap_fdatawrite_range+0x13/0x15 [84723.256378] [] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs] [84723.258556] [] ? submit_page_section+0x7b/0x111 [84723.260064] [] do_blockdev_direct_IO+0x658/0xbdb [84723.261479] [] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs] [84723.262961] [] ? btrfs_writepage_start_hook+0xce/0xce [btrfs] [84723.264449] [] __blockdev_direct_IO+0x31/0x33 [84723.265614] [] ? __blockdev_direct_IO+0x31/0x33 [84723.266769] [] ? btrfs_writepage_start_hook+0xce/0xce [btrfs] [84723.268264] [] btrfs_direct_IO+0x1b9/0x259 [btrfs] [84723.270954] [] ? btrfs_writepage_start_hook+0xce/0xce [btrfs] [84723.272465] [] generic_file_direct_write+0xb3/0x128 [84723.273734] [] btrfs_file_write_iter+0x228/0x404 [btrfs] [84723.275101] [] __vfs_write+0x7c/0xa5 [84723.276200] [] vfs_write+0xa0/0xe4 [84723.277298] [] SyS_write+0x50/0x7e [84723.278327] [] entry_SYSCALL_64_fastpath+0x12/0x6f [84723.279595] INFO: lockdep is turned off. [84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds. [84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1 [84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000 [84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200 [84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30 [84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a [84723.388620] Call Trace: [84723.389105] [] schedule+0x7d/0x95 [84723.391882] [] btrfs_start_ordered_extent+0x161/0x1fa [btrfs] [84723.393718] [] ? signal_pending_state+0x31/0x31 [84723.395659] [] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs] [84723.397383] [] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs] [84723.398852] [] __extent_readpages.constprop.20+0xed/0x100 [btrfs] [84723.400561] [] ? __lru_cache_add+0x5d/0x72 [84723.401787] [] extent_readpages+0x111/0x1a7 [btrfs] [84723.403121] [] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs] [84723.404583] [] btrfs_readpages+0x1f/0x21 [btrfs] [84723.406007] [] __do_page_cache_readahead+0x168/0x1f4 [84723.407502] [] ondemand_readahead+0x21d/0x22e [84723.408937] [] ? ondemand_readahead+0x21d/0x22e [84723.410487] [] page_cache_sync_readahead+0x3d/0x3f [84723.411710] [] btrfs_defrag_file+0x419/0xaaf [btrfs] [84723.413007] [] ? kzalloc+0xf/0x11 [btrfs] [84723.414085] [] btrfs_ioctl_defrag+0x125/0x14e [btrfs] [84723.415307] [] btrfs_ioctl+0x746/0x24c6 [btrfs] [84723.416532] [] ? arch_local_irq_save+0x9/0xc [84723.417731] [] ? __might_fault+0x4c/0xa7 [84723.418699] [] ? __might_fault+0x4c/0xa7 [84723.421532] [] ? __might_fault+0xa5/0xa7 [84723.422629] [] ? cp_new_stat+0x15d/0x174 [84723.423712] [] do_vfs_ioctl+0x427/0x4e6 [84723.424801] [] ? SYSC_newfstat+0x25/0x2e [84723.425968] [] ? __fget_light+0x4d/0x71 [84723.427063] [] SyS_ioctl+0x57/0x79 [84723.428138] [] entry_SYSCALL_64_fastpath+0x12/0x6f Consider the following logical and physical file layout: logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ... 4K 8K 16K physical: ... 12853248 12857344 1103101952 ... (= 12853248 + 4K) Extents A and B are physically adjacent. The following diagram shows a sequence of events that lead to the deadlock when we attempt to do a direct IO write against the file range [4K, 16K[ and a defrag is triggered simultaneously. CPU 1 CPU 2 btrfs_direct_IO() btrfs_get_blocks_direct() creates ordered extent A, covering the 4k prealloc extent A (range [4K, 8K[) btrfs_defrag_file() page_cache_sync_readahead([0K, 1M[) btrfs_readpages() extent_readpages() locks all pages in the file range [0K, 128K[ through calls to add_to_page_cache_lru() __do_contiguous_readpages() finds ordered extent A waits for it to complete btrfs_get_blocks_direct() called again lock_extent_direct(range [8K, 16K[) finds a page in range [8K, 16K[ through btrfs_page_exists_in_range() invalidate_inode_pages2_range([8K, 16K[) --> tries to lock pages that are already locked by the task at CPU 2 --> our task, running __blockdev_direct_IO(), hangs waiting to lock the pages and the submit bio callback, btrfs_submit_direct(), ends up never being called, resulting in the ordered extent A never completing (because a corresponding bio is never submitted) and CPU 2 will wait for it forever while holding the pages locked ---> deadlock! Fix this by removing the page invalidation approach when attempting to lock the range for IO from the callback btrfs_get_blocks_direct() and falling back buffered IO. This was a rare case anyway and well behaved applications do not mix concurrent direct IO writes with buffered reads anyway, being a concurrent defrag the only normal case that could lead to the deadlock. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 66106b48b478..269da826a40c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7411,25 +7411,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; + break; } cond_resched(); -- cgit From f28a492878170f39002660a26c329201cf678d74 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Dec 2015 19:23:20 +0000 Subject: Btrfs: fix leaking of ordered extents after direct IO write error When doing a direct IO write, __blockdev_direct_IO() can call the btrfs_get_blocks_direct() callback one or more times before it calls the btrfs_submit_direct() callback. However it can fail after calling the first callback and before calling the second callback, which is a problem because the first one creates ordered extents and the second one is the one that submits bios that cover the ordered extents created by the first one. That means the ordered extents will never complete nor have any of the flags BTRFS_ORDERED_IO_DONE / BTRFS_ORDERED_IOERR set, resulting in subsequent operations (such as other direct IO writes, buffered writes or hole punching) that lock the same IO range and lookup for ordered extents in the range to hang forever waiting for those ordered extents because they can not complete ever, since no bio was submitted. Fix this by tracking a range of created ordered extents that don't have yet corresponding bios submitted and completing the ordered extents in the range if __blockdev_direct_IO() fails with an error. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 269da826a40c..1436799b763b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -7481,11 +7488,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; - static void adjust_dio_outstanding_extents(struct inode *inode, struct btrfs_dio_data *dio_data, const u64 len) @@ -7669,6 +7671,7 @@ unlock: btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -8342,6 +8345,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8478,6 +8496,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8496,6 +8516,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (dio_data.reserve) btrfs_delalloc_release_space(inode, offset, dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); -- cgit