From bcfe06bf2622f7c4899468e427683aec49070687 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:27 -0800 Subject: mm: memcontrol: Use helpers to read page's memcg data Patch series "mm: allow mapping accounted kernel pages to userspace", v6. Currently a non-slab kernel page which has been charged to a memory cgroup can't be mapped to userspace. The underlying reason is simple: PageKmemcg flag is defined as a page type (like buddy, offline, etc), so it takes a bit from a page->mapped counter. Pages with a type set can't be mapped to userspace. But in general the kmemcg flag has nothing to do with mapping to userspace. It only means that the page has been accounted by the page allocator, so it has to be properly uncharged on release. Some bpf maps are mapping the vmalloc-based memory to userspace, and their memory can't be accounted because of this implementation detail. This patchset removes this limitation by moving the PageKmemcg flag into one of the free bits of the page->mem_cgroup pointer. Also it formalizes accesses to the page->mem_cgroup and page->obj_cgroups using new helpers, adds several checks and removes a couple of obsolete functions. As the result the code became more robust with fewer open-coded bit tricks. This patch (of 4): Currently there are many open-coded reads of the page->mem_cgroup pointer, as well as a couple of read helpers, which are barely used. It creates an obstacle on a way to reuse some bits of the pointer for storing additional bits of information. In fact, we already do this for slab pages, where the last bit indicates that a pointer has an attached vector of objcg pointers instead of a regular memcg pointer. This commits uses 2 existing helpers and introduces a new helper to converts all read sides to calls of these helpers: struct mem_cgroup *page_memcg(struct page *page); struct mem_cgroup *page_memcg_rcu(struct page *page); struct mem_cgroup *page_memcg_check(struct page *page); page_memcg_check() is intended to be used in cases when the page can be a slab page and have a memcg pointer pointing at objcg vector. It does check the lowest bit, and if set, returns NULL. page_memcg() contains a VM_BUG_ON_PAGE() check for the page not being a slab page. To make sure nobody uses a direct access, struct page's mem_cgroup/obj_cgroups is converted to unsigned long memcg_data. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com --- include/linux/memcontrol.h | 114 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 9 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..f95c1433461c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,79 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + return (struct mem_cgroup *)page->memcg_data; +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)READ_ONCE(page->memcg_data); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if (memcg_data & 0x1UL) + return NULL; + + return (struct mem_cgroup *)memcg_data; +} + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +816,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +911,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +956,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -941,6 +1021,22 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1430,7 +1526,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } -- cgit From 270c6a71460e12b07b1dcadf7457ff95b6c6e8f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:28 -0800 Subject: mm: memcontrol/slab: Use helpers to access slab page's memcg_data To gather all direct accesses to struct page's memcg_data field in one place, let's introduce 3 new helpers to use in the slab accounting code: struct obj_cgroup **page_objcgs(struct page *page); struct obj_cgroup **page_objcgs_check(struct page *page); bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs); They are similar to the corresponding API for generic pages, except that the setter can return false, indicating that the value has been already set from a different thread. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20201027001657.3398190-3-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-3-guro@fb.com --- include/linux/memcontrol.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f95c1433461c..c7ac0a5b8989 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -416,6 +416,70 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)memcg_data; } +#ifdef CONFIG_MEMCG_KMEM +/* + * page_objcgs - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function assumes that the page is known to have an + * associated object cgroups vector. It's not safe to call this function + * against pages, which might have an associated memory cgroup: e.g. + * kernel stack pages. + */ +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); +} + +/* + * page_objcgs_check - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function is safe to use if the page can be directly associated + * with a memory cgroup. + */ +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (memcg_data && (memcg_data & 0x1UL)) + return (struct obj_cgroup **)(memcg_data & ~0x1UL); + + return NULL; +} + +/* + * set_page_objcgs - associate a page with a object cgroups vector + * @page: a pointer to the page struct + * @objcgs: a pointer to the object cgroups vector + * + * Atomically associates a page with a vector of object cgroups. + */ +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); +} +#else +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return NULL; +} + +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + return NULL; +} + +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return true; +} +#endif + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) -- cgit From 87944e2992bd28098c6806086a1e96bb4d0e502b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:29 -0800 Subject: mm: Introduce page memcg flags The lowest bit in page->memcg_data is used to distinguish between struct memory_cgroup pointer and a pointer to a objcgs array. All checks and modifications of this bit are open-coded. Let's formalize it using page memcg flags, defined in enum page_memcg_data_flags. Additional flags might be added later. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-4-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-4-guro@fb.com --- include/linux/memcontrol.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c7ac0a5b8989..99a4841d658b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,15 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +enum page_memcg_data_flags { + /* page->memcg_data is a pointer to an objcgs vector */ + MEMCG_DATA_OBJCGS = (1UL << 0), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 1), +}; + +#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -404,13 +413,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) */ unsigned long memcg_data = READ_ONCE(page->memcg_data); - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. - */ - if (memcg_data & 0x1UL) + if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; return (struct mem_cgroup *)memcg_data; @@ -429,7 +432,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) */ static inline struct obj_cgroup **page_objcgs(struct page *page) { - return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -444,10 +451,10 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { unsigned long memcg_data = READ_ONCE(page->memcg_data); - if (memcg_data && (memcg_data & 0x1UL)) - return (struct obj_cgroup **)(memcg_data & ~0x1UL); + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) + return NULL; - return NULL; + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -460,7 +467,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) static inline bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs) { - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | + MEMCG_DATA_OBJCGS); } #else static inline struct obj_cgroup **page_objcgs(struct page *page) -- cgit From 18b2db3b0385226b71cb3288474fa5a6e4a45474 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:30 -0800 Subject: mm: Convert page kmemcg type to a page memcg flag PageKmemcg flag is currently defined as a page type (like buddy, offline, table and guard). Semantically it means that the page was accounted as a kernel memory by the page allocator and has to be uncharged on the release. As a side effect of defining the flag as a page type, the accounted page can't be mapped to userspace (look at page_has_type() and comments above). In particular, this blocks the accounting of vmalloc-backed memory used by some bpf maps, because these maps do map the memory to userspace. One option is to fix it by complicating the access to page->mapcount, which provides some free bits for page->page_type. But it's way better to move this flag into page->memcg_data flags. Indeed, the flag makes no sense without enabled memory cgroups and memory cgroup pointer set in particular. This commit replaces PageKmemcg() and __SetPageKmemcg() with PageMemcgKmem() and an open-coded OR operation setting the memcg pointer with the MEMCG_DATA_KMEM bit. __ClearPageKmemcg() can be simple deleted, as the whole memcg_data is zeroed at once. As a bonus, on !CONFIG_MEMCG build the PageMemcgKmem() check will be compiled out. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-5-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-5-guro@fb.com --- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 99a4841d658b..7c9d43476166 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -346,8 +346,10 @@ extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an objcgs vector */ MEMCG_DATA_OBJCGS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 1), + __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) @@ -369,8 +371,12 @@ enum page_memcg_data_flags { */ static inline struct mem_cgroup *page_memcg(struct page *page) { + unsigned long memcg_data = page->memcg_data; + VM_BUG_ON_PAGE(PageSlab(page), page); - return (struct mem_cgroup *)page->memcg_data; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -387,7 +393,8 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)READ_ONCE(page->memcg_data); + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & + ~MEMCG_DATA_FLAGS_MASK); } /* @@ -416,7 +423,21 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; - return (struct mem_cgroup *)memcg_data; + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * PageMemcgKmem - check if the page has MemcgKmem flag set + * @page: a pointer to the page struct + * + * Checks if the page has MemcgKmem flag set. The caller must ensure that + * the page has an associated memory cgroup. It's not safe to call this function + * against some types of pages, e.g. slab pages. + */ +static inline bool PageMemcgKmem(struct page *page) +{ + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); + return page->memcg_data & MEMCG_DATA_KMEM; } #ifdef CONFIG_MEMCG_KMEM @@ -435,6 +456,7 @@ static inline struct obj_cgroup **page_objcgs(struct page *page) unsigned long memcg_data = READ_ONCE(page->memcg_data); VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -454,6 +476,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) return NULL; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -1109,6 +1133,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; -- cgit