From 89c02e69fc5245f8a2f34b58b42d43a737af1a5e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Apr 2019 22:23:37 -0700 Subject: mm/memory_hotplug.c: drop memory device reference after find_memory_block() Right now we are using find_memory_block() to get the node id for the pfn range to online. We are missing to drop a reference to the memory block device. While the device still gets unregistered via device_unregister(), resulting in no user visible problem, the device is never released via device_release(), resulting in a memory leak. Fix that by properly using a put_device(). Link: http://lkml.kernel.org/r/20190411110955.1430-1-david@redhat.com Fixes: d0dc12e86b31 ("mm/memory_hotplug: optimize memory hotplug") Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0082d699be94..b236069ff0d8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,6 +874,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ mem = find_memory_block(__pfn_to_section(pfn)); nid = mem->nid; + put_device(&mem->dev); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); -- cgit From e153abc0739ff77bd89c9ba1688cdb963464af97 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Thu, 25 Apr 2019 22:23:41 -0700 Subject: zram: pass down the bvec we need to read into in the work struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When scheduling work item to read page we need to pass down the proper bvec struct which points to the page to read into. Before this patch it uses a randomly initialized bvec (only if PAGE_SIZE != 4096) which is wrong. Note that without this patch on arch/kernel where PAGE_SIZE != 4096 userspace could read random memory through a zram block device (thought userspace probably would have no control on the address being read). Link: http://lkml.kernel.org/r/20190408183219.26377-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 399cad7daae7..d58a359a6622 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -774,18 +774,18 @@ struct zram_work { struct zram *zram; unsigned long entry; struct bio *bio; + struct bio_vec bvec; }; #if PAGE_SIZE != 4096 static void zram_sync_read(struct work_struct *work) { - struct bio_vec bvec; struct zram_work *zw = container_of(work, struct zram_work, work); struct zram *zram = zw->zram; unsigned long entry = zw->entry; struct bio *bio = zw->bio; - read_from_bdev_async(zram, &bvec, entry, bio); + read_from_bdev_async(zram, &zw->bvec, entry, bio); } /* @@ -798,6 +798,7 @@ static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, { struct zram_work work; + work.bvec = *bvec; work.zram = zram; work.entry = entry; work.bio = bio; -- cgit From ae3d6a323347940f0548bbb4b17f0bb2e9164169 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 25 Apr 2019 22:23:44 -0700 Subject: lib/Kconfig.debug: fix build error without CONFIG_BLOCK If CONFIG_TEST_KMOD is set to M, while CONFIG_BLOCK is not set, XFS and BTRFS can not be compiled successly. Link: http://lkml.kernel.org/r/20190410075434.35220-1-yuehaibing@huawei.com Fixes: d9c6a72d6fa2 ("kmod: add test driver to stress test the module loader") Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Kees Cook Cc: Masahiro Yamada Cc: Petr Mladek Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Joe Lawrence Cc: Robin Murphy Cc: Luis Chamberlain Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 00dbcdbc9a0d..d5a4a4036d2f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1929,6 +1929,7 @@ config TEST_KMOD depends on m depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS depends on NETDEVICES && NET_CORE && INET # for TUN + depends on BLOCK select TEST_LKM select XFS_FS select TUN -- cgit From e789803507b2e154ed452865ee981b038654e98d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 25 Apr 2019 22:23:47 -0700 Subject: lib/test_vmalloc.c: do not create cpumask_t variable on stack On my "Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz" system(12 CPUs) i get the warning from the compiler about frame size: warning: the frame size of 1096 bytes is larger than 1024 bytes [-Wframe-larger-than=] the size of cpumask_t depends on number of CPUs, therefore just make use of cpumask_of() in set_cpus_allowed_ptr() as a second argument. Link: http://lkml.kernel.org/r/20190418193925.9361-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Uladzislau Rezki Cc: Michal Hocko Cc: Matthew Wilcox Cc: Thomas Garnier Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Joel Fernandes Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 83cdcaa82bf6..f832b095afba 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -383,14 +383,14 @@ static void shuffle_array(int *arr, int n) static int test_func(void *private) { struct test_driver *t = private; - cpumask_t newmask = CPU_MASK_NONE; int random_array[ARRAY_SIZE(test_case_array)]; int index, i, j, ret; ktime_t kt; u64 delta; - cpumask_set_cpu(t->cpu, &newmask); - set_cpus_allowed_ptr(current, &newmask); + ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu)); + if (ret < 0) + pr_err("Failed to set affinity to %d CPU\n", t->cpu); for (i = 0; i < ARRAY_SIZE(test_case_array); i++) random_array[i] = i; -- cgit From 24512228b7a3f412b5a51f189df302616b021c33 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Apr 2019 22:23:51 -0700 Subject: mm: do not boost watermarks to avoid fragmentation for the DISCONTIG memory model Mikulas Patocka reported that commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") "broke" memory management on parisc. The machine is not NUMA but the DISCONTIG model creates three pgdats even though it's a UMA machine for the following ranges 0) Start 0x0000000000000000 End 0x000000003fffffff Size 1024 MB 1) Start 0x0000000100000000 End 0x00000001bfdfffff Size 3070 MB 2) Start 0x0000004040000000 End 0x00000040ffffffff Size 3072 MB Mikulas reported: With the patch 1c30844d2, the kernel will incorrectly reclaim the first zone when it fills up, ignoring the fact that there are two completely free zones. Basiscally, it limits cache size to 1GiB. For example, if I run: # dd if=/dev/sda of=/dev/null bs=1M count=2048 - with the proper kernel, there should be "Buffers - 2GiB" when this command finishes. With the patch 1c30844d2, buffers will consume just 1GiB or slightly more, because the kernel was incorrectly reclaiming them. The page allocator and reclaim makes assumptions that pgdats really represent NUMA nodes and zones represent ranges and makes decisions on that basis. Watermark boosting for small pgdats leads to unexpected results even though this would have behaved reasonably on SPARSEMEM. DISCONTIG is essentially deprecated and even parisc plans to move to SPARSEMEM so there is no need to be fancy, this patch simply disables watermark boosting by default on DISCONTIGMEM. Link: http://lkml.kernel.org/r/20190419094335.GJ18914@techsingularity.net Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") Signed-off-by: Mel Gorman Reported-by: Mikulas Patocka Tested-by: Mikulas Patocka Acked-by: Vlastimil Babka Cc: James Bottomley Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 16 ++++++++-------- mm/page_alloc.c | 13 +++++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6af24cdb25cc..3f13d8599337 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -866,14 +866,14 @@ The intent is that compaction has less work to do in the future and to increase the success rate of future high-order allocations such as SLUB allocations, THP and hugetlbfs pages. -To make it sensible with respect to the watermark_scale_factor parameter, -the unit is in fractions of 10,000. The default value of 15,000 means -that up to 150% of the high watermark will be reclaimed in the event of -a pageblock being mixed due to fragmentation. The level of reclaim is -determined by the number of fragmentation events that occurred in the -recent past. If this value is smaller than a pageblock then a pageblocks -worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor -of 0 will disable the feature. +To make it sensible with respect to the watermark_scale_factor +parameter, the unit is in fractions of 10,000. The default value of +15,000 on !DISCONTIGMEM configurations means that up to 150% of the high +watermark will be reclaimed in the event of a pageblock being mixed due +to fragmentation. The level of reclaim is determined by the number of +fragmentation events that occurred in the recent past. If this value is +smaller than a pageblock then a pageblocks worth of pages will be reclaimed +(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. ============================================================= diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c6ce20aaf80b..6c6d9f1c404e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -266,7 +266,20 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_DISCONTIGMEM +/* + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges + * are not on separate NUMA nodes. Functionally this works but with + * watermark_boost_factor, it can reclaim prematurely as the ranges can be + * quite small. By default, do not boost watermarks on discontigmem as in + * many cases very high-order allocations like THP are likely to be + * unsupported and the premature reclaim offsets the advantage of long-term + * fragmentation avoidance. + */ +int watermark_boost_factor __read_mostly; +#else int watermark_boost_factor __read_mostly = 15000; +#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; -- cgit From ee8ab0eeb49bd3982090c8f14dc9cc65bcd13c5c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Apr 2019 22:23:54 -0700 Subject: mm, page_alloc: always use a captured page regardless of compaction result During the development of commit 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction"), a paranoid check was added to ensure that if a captured page was available after compaction that it was consistent with the final state of compaction. The intent was to catch serious programming bugs such as using a stale page pointer and causing corruption problems. However, it is possible to get a captured page even if compaction was unsuccessful if an interrupt triggered and happened to free pages in interrupt context that got merged into a suitable high-order page. It's highly unlikely but Li Wang did report the following warning on s390 occuring when testing OOM handling. Note that the warning is slightly edited for clarity. WARNING: CPU: 0 PID: 9783 at mm/page_alloc.c:3777 __alloc_pages_direct_compact+0x182/0x190 Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc pkey ghash_s390 prng xts aes_s390 des_s390 des_generic sha512_s390 zcrypt_cex4 zcrypt vmur binfmt_misc ip_tables xfs libcrc32c dasd_fba_mod qeth_l2 dasd_eckd_mod dasd_mod qeth qdio lcs ctcm ccwgroup fsm dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 9783 Comm: copy.sh Kdump: loaded Not tainted 5.1.0-rc 5 #1 This patch simply removes the check entirely instead of trying to be clever about pages freed from interrupt context. If a serious programming error was introduced, it is highly likely to be caught by prep_new_page() instead. Link: http://lkml.kernel.org/r/20190419085133.GH18914@techsingularity.net Fixes: 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction") Signed-off-by: Mel Gorman Reported-by: Li Wang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c6d9f1c404e..d167c48d913c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3786,11 +3786,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) { - WARN_ON_ONCE(page); - return NULL; - } - /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall -- cgit From 8139ad043d632c0e9e12d760068a7a8e91659aa1 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Apr 2019 22:23:58 -0700 Subject: mm/page_alloc.c: avoid potential NULL pointer dereference ac.preferred_zoneref->zone passed to alloc_flags_nofragment() can be NULL. 'zone' pointer unconditionally derefernced in alloc_flags_nofragment(). Bail out on NULL zone to avoid potential crash. Currently we don't see any crashes only because alloc_flags_nofragment() has another bug which allows compiler to optimize away all accesses to 'zone'. Link: http://lkml.kernel.org/r/20190423120806.3503-1-aryabinin@virtuozzo.com Fixes: 6bb154504f8b ("mm, page_alloc: spread allocations across zones before introducing fragmentation") Signed-off-by: Andrey Ryabinin Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d167c48d913c..9992ca7f29f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3432,6 +3432,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) alloc_flags |= ALLOC_KSWAPD; #ifdef CONFIG_ZONE_DMA32 + if (!zone) + return alloc_flags; + if (zone_idx(zone) != ZONE_NORMAL) goto out; -- cgit From 8118b82eb756e271929697e8ada5f637dc443af1 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 25 Apr 2019 22:24:01 -0700 Subject: mm/page_alloc.c: fix never set ALLOC_NOFRAGMENT flag Commit 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") removed setting of the ALLOC_NOFRAGMENT flag. Bring it back. The runtime effect is that ALLOC_NOFRAGMENT behaviour is restored so that allocations are spread across local zones to avoid fragmentation due to mixing pageblocks as long as possible. Link: http://lkml.kernel.org/r/20190423120806.3503-2-aryabinin@virtuozzo.com Fixes: 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") Signed-off-by: Andrey Ryabinin Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9992ca7f29f1..c02cff1ed56e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3436,7 +3436,7 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; if (zone_idx(zone) != ZONE_NORMAL) - goto out; + return alloc_flags; /* * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and @@ -3445,9 +3445,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) */ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); if (nr_online_nodes > 1 && !populated_zone(--zone)) - goto out; + return alloc_flags; -out: + alloc_flags |= ALLOC_NOFRAGMENT; #endif /* CONFIG_ZONE_DMA32 */ return alloc_flags; } -- cgit From 89189557b47b35683a27c80ee78aef18248eefb4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 25 Apr 2019 22:24:05 -0700 Subject: fs/proc/proc_sysctl.c: Fix a NULL pointer dereference Syzkaller report this: sysctl could not get directory: /net//bridge -12 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 7027 Comm: syz-executor.0 Tainted: G C 5.1.0-rc3+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:__write_once_size include/linux/compiler.h:220 [inline] RIP: 0010:__rb_change_child include/linux/rbtree_augmented.h:144 [inline] RIP: 0010:__rb_erase_augmented include/linux/rbtree_augmented.h:186 [inline] RIP: 0010:rb_erase+0x5f4/0x19f0 lib/rbtree.c:459 Code: 00 0f 85 60 13 00 00 48 89 1a 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 f2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 75 0c 00 00 4d 85 ed 4c 89 2e 74 ce 4c 89 ea 48 RSP: 0018:ffff8881bb507778 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: ffff8881f224b5b8 RCX: ffffffff818f3f6a RDX: 000000000000000a RSI: 0000000000000050 RDI: ffff8881f224b568 RBP: 0000000000000000 R08: ffffed10376a0ef4 R09: ffffed10376a0ef4 R10: 0000000000000001 R11: ffffed10376a0ef4 R12: ffff8881f224b558 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f3e7ce13700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fd60fbe9398 CR3: 00000001cb55c001 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: erase_entry fs/proc/proc_sysctl.c:178 [inline] erase_header+0xe3/0x160 fs/proc/proc_sysctl.c:207 start_unregistering fs/proc/proc_sysctl.c:331 [inline] drop_sysctl_table+0x558/0x880 fs/proc/proc_sysctl.c:1631 get_subdir fs/proc/proc_sysctl.c:1022 [inline] __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335 br_netfilter_init+0x68/0x1000 [br_netfilter] do_one_initcall+0xbc/0x47d init/main.c:901 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Modules linked in: br_netfilter(+) backlight comedi(C) hid_sensor_hub max3100 ti_ads8688 udc_core fddi snd_mona leds_gpio rc_streamzap mtd pata_netcell nf_log_common rc_winfast udp_tunnel snd_usbmidi_lib snd_usb_toneport snd_usb_line6 snd_rawmidi snd_seq_device snd_hwdep videobuf2_v4l2 videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops rc_gadmei_rm008z 8250_of smm665 hid_tmff hid_saitek hwmon_vid rc_ati_tv_wonder_hd_600 rc_core pata_pdc202xx_old dn_rtmsg as3722 ad714x_i2c ad714x snd_soc_cs4265 hid_kensington panel_ilitek_ili9322 drm drm_panel_orientation_quirks ipack cdc_phonet usbcore phonet hid_jabra hid extcon_arizona can_dev industrialio_triggered_buffer kfifo_buf industrialio adm1031 i2c_mux_ltc4306 i2c_mux ipmi_msghandler mlxsw_core snd_soc_cs35l34 snd_soc_core snd_pcm_dmaengine snd_pcm snd_timer ac97_bus snd_compress snd soundcore gpio_da9055 uio ecdh_generic mdio_thunder of_mdio fixed_phy libphy mdio_cavium iptable_security iptable_raw iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev tpm kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ide_pci_generic piix aes_x86_64 crypto_simd cryptd ide_core glue_helper input_leds psmouse intel_agp intel_gtt serio_raw ata_generic i2c_piix4 agpgart pata_acpi parport_pc parport floppy rtc_cmos sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: br_netfilter] Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace 68741688d5fbfe85 ]--- commit 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") forgot to handle start_unregistering() case, while header->parent is NULL, it calls erase_header() and as seen in the above syzkaller call trace, accessing &header->parent->root will trigger a NULL pointer dereference. As that commit explained, there is also no need to call start_unregistering() if header->parent is NULL. Link: http://lkml.kernel.org/r/20190409153622.28112-1-yuehaibing@huawei.com Fixes: 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Alexey Dobriyan Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - if (parent) + if (parent) { put_links(header); - start_unregistering(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); -- cgit