From 385fe0bede52a45cd960f30c7eb8d20ad8e1e05b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 1 Oct 2013 23:49:49 +0800 Subject: Btrfs: fix crash of compressed writes The crash[1] is found by xfstests/generic/208 with "-o compress", it's not reproduced everytime, but it does panic. The bug is quite interesting, it's actually introduced by a recent commit (573aecafca1cf7a974231b759197a1aebcf39c2a, Btrfs: actually limit the size of delalloc range). Btrfs implements delay allocation, so during writeback, we (1) get a page A and lock it (2) search the state tree for delalloc bytes and lock all pages within the range (3) process the delalloc range, including find disk space and create ordered extent and so on. (4) submit the page A. It runs well in normal cases, but if we're in a racy case, eg. buffered compressed writes and aio-dio writes, sometimes we may fail to lock all pages in the 'delalloc' range, in which case, we need to fall back to search the state tree again with a smaller range limit(max_bytes = PAGE_CACHE_SIZE - offset). The mentioned commit has a side effect, that is, in the fallback case, we can find delalloc bytes before the index of the page we already have locked, so we're in the case of (delalloc_end <= *start) and return with (found > 0). This ends with not locking delalloc pages but making ->writepage still process them, and the crash happens. This fixes it by just thinking that we find nothing and returning to caller as the caller knows how to deal with it properly. [1]: ------------[ cut here ]------------ kernel BUG at mm/page-writeback.c:2170! [...] CPU: 2 PID: 11755 Comm: btrfs-delalloc- Tainted: G O 3.11.0+ #8 [...] RIP: 0010:[] [] clear_page_dirty_for_io+0x1e/0x83 [...] [ 4934.248731] Stack: [ 4934.248731] ffff8801477e5dc8 ffffea00049b9f00 ffff8801869f9ce8 ffffffffa02b841a [ 4934.248731] 0000000000000000 0000000000000000 0000000000000fff 0000000000000620 [ 4934.248731] ffff88018db59c78 ffffea0005da8d40 ffffffffa02ff860 00000001810016c0 [ 4934.248731] Call Trace: [ 4934.248731] [] extent_range_clear_dirty_for_io+0xcf/0xf5 [btrfs] [ 4934.248731] [] compress_file_range+0x1dc/0x4cb [btrfs] [ 4934.248731] [] ? detach_if_pending+0x22/0x4b [ 4934.248731] [] async_cow_start+0x35/0x53 [btrfs] [ 4934.248731] [] worker_loop+0x14b/0x48c [btrfs] [ 4934.248731] [] ? btrfs_queue_worker+0x25c/0x25c [btrfs] [ 4934.248731] [] kthread+0x8d/0x95 [ 4934.248731] [] ? kthread_freezable_should_stop+0x43/0x43 [ 4934.248731] [] ret_from_fork+0x7c/0xb0 [ 4934.248731] [] ? kthread_freezable_should_stop+0x43/0x43 [ 4934.248731] Code: ff 85 c0 0f 94 c0 0f b6 c0 59 5b 5d c3 0f 1f 44 00 00 55 48 89 e5 41 54 53 48 89 fb e8 2c de 00 00 49 89 c4 48 8b 03 a8 01 75 02 <0f> 0b 4d 85 e4 74 52 49 8b 84 24 80 00 00 00 f6 40 20 01 75 44 [ 4934.248731] RIP [] clear_page_dirty_for_io+0x1e/0x83 [ 4934.248731] RSP [ 4934.280307] ---[ end trace 36f06d3f8750236a ]--- Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c09a40db53d..43feb4663f5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1614,7 +1614,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return found; + return 0; } /* -- cgit v1.2.3-70-g09d2 From b208c2f7ceafacbc44f13d1b5a9fbada98226183 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 19 Sep 2013 20:37:07 -0700 Subject: btrfs: Fix crash due to not allocating integrity data for a bioset When btrfs creates a bioset, we must also allocate the integrity data pool. Otherwise btrfs will crash when it tries to submit a bio to a checksumming disk: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] mempool_alloc+0x4a/0x150 PGD 2305e4067 PUD 23063d067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: btrfs scsi_debug xfs ext4 jbd2 ext3 jbd mbcache sch_fq_codel eeprom lpc_ich mfd_core nfsd exportfs auth_rpcgss af_packet raid6_pq xor zlib_deflate libcrc32c [last unloaded: scsi_debug] CPU: 1 PID: 4486 Comm: mount Not tainted 3.12.0-rc1-mcsum #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8802451c9720 ti: ffff880230698000 task.ti: ffff880230698000 RIP: 0010:[] [] mempool_alloc+0x4a/0x150 RSP: 0018:ffff880230699688 EFLAGS: 00010286 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000005f8445 RDX: 0000000000000001 RSI: 0000000000000010 RDI: 0000000000000000 RBP: ffff8802306996f8 R08: 0000000000011200 R09: 0000000000000008 R10: 0000000000000020 R11: ffff88009d6e8000 R12: 0000000000011210 R13: 0000000000000030 R14: ffff8802306996b8 R15: ffff8802451c9720 FS: 00007f25b8a16800(0000) GS:ffff88024fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000018 CR3: 0000000230576000 CR4: 00000000000007e0 Stack: ffff8802451c9720 0000000000000002 ffffffff81a97100 0000000000281250 ffffffff81a96480 ffff88024fc99150 ffff880228d18200 0000000000000000 0000000000000000 0000000000000040 ffff880230e8c2e8 ffff8802459dc900 Call Trace: [] bio_integrity_alloc+0x48/0x1b0 [] bio_integrity_prep+0xac/0x360 [] ? mempool_alloc+0x58/0x150 [] ? alloc_extent_state+0x31/0x110 [btrfs] [] blk_queue_bio+0x1c9/0x460 [] generic_make_request+0xca/0x100 [] submit_bio+0x79/0x160 [] btrfs_map_bio+0x48e/0x5b0 [btrfs] [] btree_submit_bio_hook+0xda/0x110 [btrfs] [] submit_one_bio+0x6a/0xa0 [btrfs] [] read_extent_buffer_pages+0x250/0x310 [btrfs] [] ? __radix_tree_preload+0x66/0xf0 [] ? radix_tree_insert+0x95/0x260 [] btree_read_extent_buffer_pages.constprop.128+0xb6/0x120 [btrfs] [] read_tree_block+0x3a/0x60 [btrfs] [] open_ctree+0x139d/0x2030 [btrfs] [] btrfs_mount+0x53a/0x7d0 [btrfs] [] ? pcpu_alloc+0x8eb/0x9f0 [] ? __kmalloc_track_caller+0x35/0x1e0 [] mount_fs+0x20/0xd0 [] vfs_kern_mount+0x76/0x120 [] do_mount+0x200/0xa40 [] ? strndup_user+0x5b/0x80 [] SyS_mount+0x90/0xe0 [] system_call_fastpath+0x1a/0x1f Code: 4c 8d 75 a8 4c 89 6d e8 45 89 e0 4c 8d 6f 30 48 89 5d d8 41 83 e0 af 48 89 fb 49 83 c6 18 4c 89 7d f8 65 4c 8b 3c 25 c0 b8 00 00 <48> 8b 73 18 44 89 c7 44 89 45 98 ff 53 20 48 85 c0 48 89 c2 74 RIP [] mempool_alloc+0x4a/0x150 RSP CR2: 0000000000000018 ---[ end trace 7a96042017ed21e2 ]--- Signed-off-by: Darrick J. Wong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43feb4663f5..22bda32acb8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -145,8 +145,16 @@ int __init extent_io_init(void) offsetof(struct btrfs_io_bio, bio)); if (!btrfs_bioset) goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + return 0; +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; -- cgit v1.2.3-70-g09d2 From 7bf811a595a895b7a886dcf218d0d34f97df76dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Oct 2013 22:11:09 -0400 Subject: Btrfs: limit delalloc pages outside of find_delalloc_range Liu fixed part of this problem and unfortunately I steered him in slightly the wrong direction and so didn't completely fix the problem. The problem is we limit the size of the delalloc range we are looking for to max bytes and then we try to lock that range. If we fail to lock the pages in that range we will shrink the max bytes to a single page and re loop. However if our first page is inside of the delalloc range then we will end up limiting the end of the range to a period before our first page. This is illustrated below [0 -------- delalloc range --------- 256mb] [page] So find_delalloc_range will return with delalloc_start as 0 and end as 128mb, and then we will notice that delalloc_start < *start and adjust it up, but not adjust delalloc_end up, so things go sideways. To fix this we need to not limit the max bytes in find_delalloc_range, but in find_lock_delalloc_range and that way we don't end up with this confusion. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43feb4663f5..d8ea0cb200b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1482,10 +1482,8 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, cur_start = state->end + 1; node = rb_next(node); total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) { - *end = *start + max_bytes - 1; + if (total_bytes >= max_bytes) break; - } if (!node) break; } @@ -1627,10 +1625,9 @@ again: /* * make sure to limit the number of pages we try to lock down - * if we're looping. */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, @@ -1641,8 +1638,7 @@ again: */ free_extent_state(cached_state); if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; + max_bytes = PAGE_CACHE_SIZE; loops = 1; goto again; } else { -- cgit v1.2.3-70-g09d2