From d8f5f7e445f02eb10dee1a0a992146314cf460f8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 29 Aug 2023 14:37:34 -0700 Subject: hugetlb: set hugetlb page flag before optimizing vmemmap Currently, vmemmap optimization of hugetlb pages is performed before the hugetlb flag (previously hugetlb destructor) is set identifying it as a hugetlb folio. This means there is a window of time where an ordinary folio does not have all associated vmemmap present. The core mm only expects vmemmap to be potentially optimized for hugetlb and device dax. This can cause problems in code such as memory error handling that may want to write to tail struct pages. There is only one call to perform hugetlb vmemmap optimization today. To fix this issue, simply set the hugetlb flag before that call. There was a similar issue in the free hugetlb path that was previously addressed. The two routines that optimize or restore hugetlb vmemmap should only be passed hugetlb folios/pages. To catch any callers not following this rule, add VM_WARN_ON calls to the routines. In the hugetlb free code paths, some calls could be made to restore vmemmap after clearing the hugetlb flag. This was 'safe' as in these cases vmemmap was already present and the call was a NOOP. However, for consistency these calls where eliminated so that we can add the VM_WARN_ON checks. Link: https://lkml.kernel.org/r/20230829213734.69673-1-mike.kravetz@oracle.com Fixes: f41f2ed43ca5 ("mm: hugetlb: free the vmemmap pages associated with each HugeTLB page") Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: James Houghton Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 52d26072dfda55..afce9037f9ee9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1720,7 +1720,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, &folio->page)) { + /* + * If folio is not vmemmap optimized (!clear_dtor), then the folio + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * can only be passed hugetlb pages and will BUG otherwise. + */ + if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1930,9 +1935,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { + folio_set_hugetlb(folio); hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -3580,13 +3585,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; + /* + * If vmemmap already existed for folio, the remove routine above would + * have cleared the hugetlb folio flag. Hence the folio is technically + * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * passed hugetlb folios and will BUG otherwise. + */ + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote folio */ + spin_lock_irq(&hugetlb_lock); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); + return rc; + } } /* -- cgit 1.2.3-korg From b72b3c9c34c825c81d205241c5f822fc7835923f Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Tue, 29 Aug 2023 11:33:43 +0800 Subject: mm/hugetlb: fix nodes huge page allocation when there are surplus pages In set_nr_huge_pages(), local variable "count" is used to record persistent_huge_pages(), but when it cames to nodes huge page allocation, the semantics changes to nr_huge_pages. When there exists surplus huge pages and using the interface under /sys/devices/system/node/node*/hugepages to change huge page pool size, this difference can result in the allocation of an unexpected number of huge pages. Steps to reproduce the bug: Starting with: Node 0 Node 1 Total HugePages_Total 0.00 0.00 0.00 HugePages_Free 0.00 0.00 0.00 HugePages_Surp 0.00 0.00 0.00 create 100 huge pages in Node 0 and consume it, then set Node 0 's nr_hugepages to 0. yields: Node 0 Node 1 Total HugePages_Total 200.00 0.00 200.00 HugePages_Free 0.00 0.00 0.00 HugePages_Surp 200.00 0.00 200.00 write 100 to Node 1's nr_hugepages echo 100 > /sys/devices/system/node/node1/\ hugepages/hugepages-2048kB/nr_hugepages gets: Node 0 Node 1 Total HugePages_Total 200.00 400.00 600.00 HugePages_Free 0.00 400.00 400.00 HugePages_Surp 200.00 0.00 200.00 Kernel is expected to create only 100 huge pages and it gives 200. Link: https://lkml.kernel.org/r/20230829033343.467779-1-xueshi.hu@smartx.com Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes") Signed-off-by: Xueshi Hu Reviewed-by: Mike Kravetz Cc: Andi Kleen Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index afce9037f9ee9c..7c90c43574a643 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3457,7 +3457,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (nid != NUMA_NO_NODE) { unsigned long old_count = count; - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + count += persistent_huge_pages(h) - + (h->nr_huge_pages_node[nid] - + h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted -- cgit 1.2.3-korg From 426056efe835cf4864ccf4c328fe3af9146fc539 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:45 -0400 Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. A wrong or non-existing page might be tried to be grabbed, either leading to a non freeable page or kernel memory access errors. No bug is reported. It comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-3-zi.yan@sent.com Fixes: 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c90c43574a643..a945efe2858af7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6493,7 +6493,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } } - page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); /* * Note that page may be a sub-page, and with vmemmap -- cgit 1.2.3-korg From fde1c4ecf91640e5a95ec36b71ec2e8ec379ce40 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:01 +0100 Subject: mm: hugetlb: skip initialization of gigantic tail struct pages if freed by HVO The new boot flow when it comes to initialization of gigantic pages is as follows: - At boot time, for a gigantic page during __alloc_bootmem_hugepage, the region after the first struct page is marked as noinit. - This results in only the first struct page to be initialized in reserve_bootmem_region. As the tail struct pages are not initialized at this point, there can be a significant saving in boot time if HVO succeeds later on. - Later on in the boot, the head page is prepped and the first HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized. - HVO is attempted. If it is not successful, then the rest of the tail struct pages are initialized. If it is successful, no more tail struct pages need to be initialized saving significant boot time. The WARN_ON for increased ref count in gather_bootmem_prealloc was changed to a VM_BUG_ON. This is OK as there should be no speculative references this early in boot process. The VM_BUG_ON's are there just in case such code is introduced. [akpm@linux-foundation.org: make it nicer for 80 cols] Link: https://lkml.kernel.org/r/20230913105401.519709-5-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- mm/hugetlb_vmemmap.c | 2 +- mm/hugetlb_vmemmap.h | 9 +++---- mm/internal.h | 3 +++ mm/mm_init.c | 2 +- 5 files changed, 69 insertions(+), 15 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a945efe2858af7..4e276466d6aa42 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3169,6 +3169,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } found: + + /* + * Only initialize the head struct page in memmap_init_reserved_pages, + * rest of the struct pages will be initialized by the HugeTLB + * subsystem itself. + * The head struct page is used to get folio information by the HugeTLB + * subsystem like zone id and node id. + */ + memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), + huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -3176,6 +3186,43 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) return 1; } +/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ +static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + unsigned long start_page_number, + unsigned long end_page_number) +{ + enum zone_type zone = zone_idx(folio_zone(folio)); + int nid = folio_nid(folio); + unsigned long head_pfn = folio_pfn(folio); + unsigned long pfn, end_pfn = head_pfn + end_page_number; + int ret; + + for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone, nid); + prep_compound_tail((struct page *)folio, pfn - head_pfn); + ret = page_ref_freeze(page, 1); + VM_BUG_ON(!ret); + } +} + +static void __init hugetlb_folio_init_vmemmap(struct folio *folio, + struct hstate *h, + unsigned long nr_pages) +{ + int ret; + + /* Prepare folio head */ + __folio_clear_reserved(folio); + __folio_set_head(folio); + ret = page_ref_freeze(&folio->page, 1); + VM_BUG_ON(!ret); + /* Initialize the necessary tail struct pages */ + hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + prep_compound_head((struct page *)folio, huge_page_order(h)); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3186,19 +3233,21 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); - struct folio *folio = page_folio(page); + struct folio *folio = (void *)page; struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); - if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { - WARN_ON(folio_test_reserved(folio)); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_folio(folio); /* add to the hugepage allocator */ - } else { - /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_folio(folio, huge_page_order(h)); - } + + hugetlb_folio_init_vmemmap(folio, h, + HUGETLB_VMEMMAP_RESERVE_PAGES); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + /* If HVO fails, initialize all tail struct pages */ + if (!HPageVmemmapOptimized(&folio->page)) + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + free_huge_folio(folio); /* add to the hugepage allocator */ /* * We need to restore the 'stolen' pages to totalram_pages @@ -3209,6 +3258,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ad650c7b07ec4c..76682d1d79a746 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -588,7 +588,7 @@ static int __init hugetlb_vmemmap_init(void) const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ - BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 25bd0e00243140..4573899855d706 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,15 +10,16 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); - /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/vm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE +#define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { diff --git a/mm/internal.h b/mm/internal.h index a273f4d948d86f..f7c963dfbdb349 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1155,6 +1155,9 @@ struct vma_prepare { struct vm_area_struct *remove2; }; +void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid); + /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); diff --git a/mm/mm_init.c b/mm/mm_init.c index 6be6f50813b1eb..077bfe393b5e29 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -555,7 +555,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) node_states[N_MEMORY] = saved_node_state; } -static void __meminit __init_single_page(struct page *page, unsigned long pfn, +void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { mm_zero_struct_page(page); -- cgit 1.2.3-korg From 3ec145f9d01e799bc7e41277e571141546b850f3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:23 +0100 Subject: hugetlb: use a folio in free_hpage_workfn() Patch series "Small hugetlb cleanups", v2. Some trivial folio conversions This patch (of 3): update_and_free_hugetlb_folio puts the memory on hpage_freelist as a folio so we can take it off the list as a folio. Link: https://lkml.kernel.org/r/20230824141325.2704553-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230824141325.2704553-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e276466d6aa42..7a6fe4f137779f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,22 +1787,22 @@ static void free_hpage_workfn(struct work_struct *work) node = llist_del_all(&hpage_freelist); while (node) { - struct page *page; + struct folio *folio; struct hstate *h; - page = container_of((struct address_space **)node, - struct page, mapping); + folio = container_of((struct address_space **)node, + struct folio, mapping); node = node->next; - page->mapping = NULL; + folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ - h = size_to_hstate(page_size(page)); + h = size_to_hstate(folio_size(folio)); - __update_and_free_hugetlb_folio(h, page_folio(page)); + __update_and_free_hugetlb_folio(h, folio); cond_resched(); } -- cgit 1.2.3-korg From 04bbfd844b99c7c89a344236d3758e7a2d41572f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:24 +0100 Subject: hugetlb: remove a few calls to page_folio() Anything found on a linked list threaded through ->lru is guaranteed to be a folio as the compound_head found in a tail page overlaps the ->lru member of struct page. So we can pull folios directly off these lists no matter whether pages or folios were added to the list. Link: https://lkml.kernel.org/r/20230824141325.2704553-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a6fe4f137779f..e91ce966a2c913 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1836,11 +1836,9 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { - struct page *page, *t_page; - struct folio *folio; + struct folio *folio, *t_folio; - list_for_each_entry_safe(page, t_page, list, lru) { - folio = page_folio(page); + list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } @@ -2229,8 +2227,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, bool acct_surplus) { int nr_nodes, node; - struct page *page = NULL; - struct folio *folio; + struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2240,15 +2237,14 @@ static struct page *remove_pool_huge_page(struct hstate *h, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - page = list_entry(h->hugepage_freelists[node].next, - struct page, lru); - folio = page_folio(page); + folio = list_entry(h->hugepage_freelists[node].next, + struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } - return page; + return &folio->page; } /* @@ -3414,15 +3410,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count, * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { - struct page *page, *next; + struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; - list_for_each_entry_safe(page, next, freel, lru) { + list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; - if (PageHighMem(page)) + if (folio_test_highmem(folio)) continue; - remove_hugetlb_folio(h, page_folio(page), false); - list_add(&page->lru, &page_list); + remove_hugetlb_folio(h, folio, false); + list_add(&folio->lru, &page_list); } } -- cgit 1.2.3-korg From d5b43e9683ec5c78524f9ae93c1824edcbc1f08d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:25 +0100 Subject: hugetlb: convert remove_pool_huge_page() to remove_pool_hugetlb_folio() Convert the callers to expect a folio and remove the unnecesary conversion back to a struct page. Link: https://lkml.kernel.org/r/20230824141325.2704553-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91ce966a2c913..de220e3ff8befd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1446,7 +1446,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for remove_pool_huge_page() - return the previously saved + * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -2222,9 +2222,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static struct page *remove_pool_huge_page(struct hstate *h, - nodemask_t *nodes_allowed, - bool acct_surplus) +static struct folio *remove_pool_hugetlb_folio(struct hstate *h, + nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; @@ -2244,7 +2243,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, } } - return &folio->page; + return folio; } /* @@ -2598,7 +2597,6 @@ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - struct page *page; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); @@ -2619,15 +2617,17 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * remove_pool_huge_page() will balance the freed pages across the + * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); + if (!folio) goto out; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } out: @@ -3472,7 +3472,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; - struct page *page; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3594,11 +3593,13 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - page = remove_pool_huge_page(h, nodes_allowed, 0); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); + if (!folio) break; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); -- cgit 1.2.3-korg From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 12:20:17 -0700 Subject: mm/filemap: remove hugetlb special casing in filemap.c Remove special cased hugetlb handling code within the page cache by changing the granularity of ->index to the base page size rather than the huge page size. The motivation of this patch is to reduce complexity within the filemap code while also increasing performance by removing branches that are evaluated on every page cache lookup. To support the change in index, new wrappers for hugetlb page cache interactions are added. These wrappers perform the conversion to a linear index which is now expected by the page cache for huge pages. ========================= PERFORMANCE ====================================== Perf was used to check the performance differences after the patch. Overall the performance is similar to mainline with a very small larger overhead that occurs in __filemap_add_folio() and hugetlb_add_to_page_cache(). This is because of the larger overhead that occurs in xa_load() and xa_store() as the xarray is now using more entries to store hugetlb folios in the page cache. Timing aarch64 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt real 1m49.568s user 0m0.000s sys 1m49.461s 6.5-rc3: [root]# time fallocate -l 700GB test.txt real 1m47.495s user 0m0.000s sys 1m47.370s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m47.024s user 0m0.000s sys 1m46.921s 6.5-rc3: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m44.551s user 0m0.000s sys 1m44.438s x86 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt real 0m22.383s user 0m0.000s sys 0m22.255s 6.5-rc3: [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt real 0m22.735s user 0m0.038s sys 0m22.567s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt real 0m25.786s user 0m0.001s sys 0m25.589s 6.5-rc3: [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt real 0m33.454s user 0m0.001s sys 0m33.193s aarch64: workload - fallocate a 700GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--95.04%--__pi_clear_page | |--3.57%--clear_huge_page | | | |--2.63%--rcu_all_qs | | | --0.91%--__cond_resched | --0.67%--__cond_resched 0.17% 0.00% 0 fallocate [kernel.vmlinux] [k] hugetlb_add_to_page_cache 0.14% 0.10% 11 fallocate [kernel.vmlinux] [k] __filemap_add_folio 6.5-rc3 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--94.91%--__pi_clear_page | |--4.11%--clear_huge_page | | | |--3.00%--rcu_all_qs | | | --1.10%--__cond_resched | --0.59%--__cond_resched 0.08% 0.01% 1 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.05% 0.03% 3 fallocate [kernel.kallsyms] [k] __filemap_add_folio x86 workload - fallocate a 100GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: hugetlbfs_fallocate | --99.57%--clear_huge_page | --98.47%--clear_page_erms | --0.53%--asm_sysvec_apic_timer_interrupt 0.04% 0.04% 1 fallocate [kernel.kallsyms] [k] xa_load 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] __filemap_add_folio 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] xas_store 6.5-rc3 2MB Page Size: --99.93%--__x64_sys_fallocate vfs_fallocate hugetlbfs_fallocate | --99.38%--clear_huge_page | |--98.40%--clear_page_erms | --0.59%--__cond_resched 0.03% 0.03% 1 fallocate [kernel.kallsyms] [k] __filemap_add_folio ========================= TESTING ====================================== This patch passes libhugetlbfs tests and LTP hugetlb tests ********** TEST SUMMARY * 2M * 32-bit 64-bit * Total testcases: 110 113 * Skipped: 0 0 * PASS: 107 113 * FAIL: 0 0 * Killed by signal: 3 0 * Bad configuration: 0 0 * Expected FAIL: 0 0 * Unexpected PASS: 0 0 * Test not present: 0 0 * Strange test result: 0 0 ********** Done executing testcases. LTP Version: 20220527-178-g2761a81c4 page migration was also tested using Mike Kravetz's test program.[8] [dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Dan Carpenter Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 37 +++++++++++++++++++------------------ include/linux/hugetlb.h | 12 ++++++++++++ include/linux/pagemap.h | 32 ++------------------------------ mm/filemap.c | 34 ++++++++++------------------------ mm/hugetlb.c | 32 ++++++-------------------------- mm/migrate.c | 6 +++--- 6 files changed, 52 insertions(+), 101 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 60fce26ff9378d..926d01c493fb33 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t retval = 0; while (iov_iter_count(to)) { - struct page *page; + struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ @@ -352,18 +352,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } nr = nr - offset; - /* Find the page */ - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + /* Find the folio */ + folio = filemap_lock_hugetlb_folio(h, mapping, index); + if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { - unlock_page(page); + folio_unlock(folio); - if (!PageHWPoison(page)) + if (!folio_test_has_hwpoisoned(folio)) want = nr; else { /* @@ -371,19 +371,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) * touching the 1st raw HWPOISON subpage after * offset. */ - want = adjust_range_hwpoison(page, offset, nr); + want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { - put_page(page); + folio_put(folio); retval = -EIO; break; } } /* - * We have the page, copy it to user space buffer. + * We have the folio, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, want, to); - put_page(page); + copied = copy_folio_to_iter(folio, offset, want, to); + folio_put(folio); } offset += copied; retval += copied; @@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> huge_page_shift(h); - const pgoff_t end = lend >> huge_page_shift(h); + const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); - next = start; + next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = folio->index; + index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } if (truncate_op) - (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); + (void)hugetlb_unreserve_pages(inode, + lstart >> huge_page_shift(h), + LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) @@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; @@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - folio = filemap_get_folio(mapping, index); + folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a30686e649f7ac..d935b0584f1d15 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) return huge_page_size(h) / 512; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return filemap_lock_folio(mapping, idx << huge_page_order(h)); +} + #include #ifndef is_hugepage_only_range @@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return NULL; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return NULL; +} + static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 351c3b7f93a14e..759b29d9a69ac8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio) */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return &folio->page; return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } @@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return folio->index == index; return index - folio_index(folio) < folio_nr_pages(folio); } @@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, } /* - * Get index of the page within radix-tree (but not for hugetlb pages). - * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + * Get the offset in PAGE_SIZE (even for hugetlb pages). */ -static inline pgoff_t page_to_index(struct page *page) +static inline pgoff_t page_to_pgoff(struct page *page) { struct page *head; @@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page) return head->index + page - head; } -extern pgoff_t hugetlb_basepage_index(struct page *page); - -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). - * (TODO: hugetlb pages should have ->index in PAGE_SIZE) - */ -static inline pgoff_t page_to_pgoff(struct page *page) -{ - if (unlikely(PageHuge(page))) - return hugetlb_basepage_index(page); - return page_to_index(page); -} - /* * Return byte-offset into filesystem object for page. */ @@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). - * (TODO: hugetlb folios should have ->index in PAGE_SIZE) */ static inline pgoff_t folio_pgoff(struct folio *folio) { - if (unlikely(folio_test_hugetlb(folio))) - return hugetlb_basepage_index(&folio->page); return folio->index; } -extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address); - static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; - if (unlikely(is_vm_hugetlb_page(vma))) - return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; diff --git a/mm/filemap.c b/mm/filemap.c index 9481ffaf24e6e4..340c693112a915 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping, mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by a single entry in the xarray */ - if (!folio_test_hugetlb(folio)) { - xas_set_order(&xas, folio->index, folio_order(folio)); - nr = folio_nr_pages(folio); - } + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio) if (free_folio) free_folio(folio); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } @@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping, if (!huge) { int error = mem_cgroup_charge(folio, NULL, gfp); - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) return error; charged = true; - xas_set_order(&xas, index, folio_order(folio)); - nr = folio_nr_pages(folio); } + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); + xas_set_order(&xas, index, folio_order(folio)); + nr = folio_nr_pages(folio); + gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; @@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2104,7 +2102,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2232,10 +2224,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (nr) { folio = fbatch->folios[nr - 1]; - if (folio_test_hugetlb(folio)) - *start = folio->index + 1; - else - *start = folio_next_index(folio); + *start = folio->index + folio_nr_pages(folio); } out: rcu_read_unlock(); @@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de220e3ff8befd..0aaf28ce32e5c2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t) /* * Convert the address within this vma to the page offset within - * the mapping, in pagecache page units; huge pages here. + * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. @@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t hugetlb_basepage_index(struct page *page) -{ - struct page *page_head = compound_head(page); - pgoff_t index = page_index(page_head); - unsigned long compound_idx; - - if (compound_order(page_head) > MAX_ORDER) - compound_idx = page_to_pfn(page) - page_to_pfn(page_head); - else - compound_idx = page - page_head; - - return (index << compound_order(page_head)) + compound_idx; -} - static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = vma_hugecache_offset(h, vma, address); + pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); @@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping struct hstate *h = hstate_inode(inode); int err; + idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); @@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -6205,7 +6185,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_folio(mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (is_continue) { ret = -EFAULT; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; diff --git a/mm/migrate.c b/mm/migrate.c index 7d1804c4a5d93c..942f8c9cd06818 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -524,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; xas_lock_irq(&xas); - expected_count = 2 + folio_has_private(src); + expected_count = folio_expected_refs(mapping, src); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; @@ -533,11 +533,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, dst->index = src->index; dst->mapping = src->mapping; - folio_get(dst); + folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); - folio_ref_unfreeze(src, expected_count - 1); + folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); -- cgit 1.2.3-korg From a48bf7b4757cd8de3497c2878536f46a8d2da65c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 10:44:33 -0700 Subject: mm/hugetlb: replace page_ref_freeze() with folio_ref_freeze() in hugetlb_folio_init_vmemmap() No functional difference, folio_ref_freeze() is currently a wrapper for page_ref_freeze(). Link: https://lkml.kernel.org/r/20230926174433.81241-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0aaf28ce32e5c2..36b40bc9ac250e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3191,7 +3191,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); - ret = page_ref_freeze(&folio->page, 1); + ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); -- cgit 1.2.3-korg From 30a89adf872d2e46323840964c95dc0ae3bb5843 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 16 Oct 2023 19:55:49 -0700 Subject: hugetlb: check for hugetlb folio before vmemmap_restore In commit d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") checks were added to print a warning if hugetlb_vmemmap_restore was called on a non-hugetlb page. This was mostly due to ordering issues in the hugetlb page set up and tear down sequencees. One place missed was the routine dissolve_free_huge_page. Naoya Horiguchi noted: "I saw that VM_WARN_ON_ONCE() in hugetlb_vmemmap_restore is triggered when memory_failure() is called on a free hugetlb page with vmemmap optimization disabled (the warning is not triggered if vmemmap optimization is enabled). I think that we need check folio_test_hugetlb() before dissolve_free_huge_page() calls hugetlb_vmemmap_restore_folio()." Perform the check as suggested by Naoya. Link: https://lkml.kernel.org/r/20231017032140.GA3680@monkey Fixes: d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") Signed-off-by: Mike Kravetz Suggested-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fb3a5232173b6..dfb4534834f5a0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2322,17 +2322,23 @@ int dissolve_free_huge_page(struct page *page) * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. + * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. */ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: -- cgit 1.2.3-korg From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch() retrieves the addresses of the pages that are written to in a region of virtual memory. This syscall is used in Windows applications and games etc. This syscall is being emulated in pretty slow manner in userspace. Our purpose is to enhance the kernel such that we translate it efficiently in a better way. Currently some out of tree hack patches are being used to efficiently emulate it in some kernels. We intend to replace those with these patches. So the whole gaming on Linux can effectively get benefit from this. It means there would be tons of users of this code. CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, only then we can start dumping pages. The information about pages becomes more and more outdated, while we are processing pages. The new interface solves both these downsides. First, it allows us to read pte bits and clear the soft-dirty bit atomically. It means that CRIU will not need to freeze processes to pre-dump their memory. Second, it clears soft-dirty bits for a specified region of memory. It means CRIU will have actual info about pages to the moment of dumping them. * The new interface has to be much faster because basic page filtering is happening in the kernel. With the old interface, we have to read pagemap for each page. *Implementation Evolution (Short Summary)* From the definition of GetWriteWatch(), we feel like kernel's soft-dirty feature can be used under the hood with some additions like: * reset soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process * get and clear soft-dirty flag for a specific region atomically So we decided to use ioctl on pagemap file to read or/and reset soft-dirty flag. But using soft-dirty flag, sometimes we get extra pages which weren't even written. They had become soft-dirty because of VMA merging and VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were able to by-pass this short coming by ignoring VM_SOFTDIRTY until David reported that mprotect etc messes up the soft-dirty flag while ignoring VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We discussed if we can revert these patches. But we could not reach to any conclusion. So at this point, I made couple of tries to solve this whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation: * [7] Correct the bug fixed wrongly back in 2014. It had potential to cause regression. We left it behind. * [8] Keep a list of soft-dirty part of a VMA across splits and merges. I got the reply don't increase the size of the VMA by 8 bytes. At this point, we left soft-dirty considering it is too much delicate and userfaultfd [9] seemed like the only way forward. From there onward, we have been basing soft-dirty emulation on userfaultfd wp feature where kernel resolves the faults itself when WP_ASYNC feature is used. It was straight forward to add WP_ASYNC feature in userfautlfd. Now we get only those pages dirty or written-to which are really written in reality. (PS There is another WP_UNPOPULATED userfautfd feature is required which is needed to avoid pre-faulting memory before write-protecting [9].) All the different masks were added on the request of CRIU devs to create interface more generic and better. [1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch [2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com [3] https://github.com/google/sanitizers [4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit [5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com [6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/ [7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com [8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com [9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com [10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com This patch (of 6): Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows userfaultfd wr-protect faults to be resolved by the kernel directly. It can be used like a high accuracy version of soft-dirty, without vma modifications during tracking, and also with ranged support by default rather than for a whole mm when reset the protections due to existence of ioctl(UFFDIO_WRITEPROTECT). Several goals of such a dirty tracking interface: 1. All types of memory should be supported and tracable. This is nature for soft-dirty but should mention when the context is userfaultfd, because it used to only support anon/shmem/hugetlb. The problem is for a dirty tracking purpose these three types may not be enough, and it's legal to track anything e.g. any page cache writes from mmap. 2. Protections can be applied to partial of a memory range, without vma split/merge fuss. The hope is that the tracking itself should not affect any vma layout change. It also helps when reset happens because the reset will not need mmap write lock which can block the tracee. 3. Accuracy needs to be maintained. This means we need pte markers to work on any type of VMA. One could question that, the whole concept of async dirty tracking is not really close to fundamentally what userfaultfd used to be: it's not "a fault to be serviced by userspace" anymore. However, using userfaultfd-wp here as a framework is convenient for us in at least: 1. VM_UFFD_WP vma flag, which has a very good name to suite something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new feature bit to identify from a sync version of uffd-wp registration. 2. PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as an arch supports, this also applies to this case where uffd-wp bit will be a hint to dirty information and it will not go lost easily (e.g. when some page cache ptes got zapped). 3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or resetting a range of memory, while there's no counterpart in the old soft-dirty world, hence if this is wanted in a new design we'll need a new interface otherwise. We can somehow understand that commonality because uffd-wp was fundamentally a similar idea of write-protecting pages just like soft-dirty. This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC seems to not usable if without WP_UNPOPULATE. This also gives us chance to modify impl of WP_ASYNC just in case it could be not depending on WP_UNPOPULATED anymore in the future kernels. It's also fine to imply that because both features will rely on PTE_MARKER_UFFD_WP config option, so they'll show up together (or both missing) in an UFFDIO_API probe. vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp. So we can track dirty for all kinds of memory including generic file systems (like XFS, EXT4 or BTRFS). One trick worth mention in do_wp_page() is that we need to manually update vmf->orig_pte here because it can be used later with a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags. The major defect of this approach of dirty tracking is we need to populate the pgtables when tracking starts. Soft-dirty doesn't do it like that. It's unwanted in the case where the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet). One way to improve this is to allow pte markers exist for larger than PTE level for PMD+. That will not change the interface if to implemented, so we can leave that for later. Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 35 ++++++++++++++++++++++++++++ fs/userfaultfd.c | 26 +++++++++++++++++---- include/linux/userfaultfd_k.h | 21 ++++++++++++++++- include/uapi/linux/userfaultfd.h | 9 ++++++- mm/hugetlb.c | 32 ++++++++++++++----------- mm/memory.c | 28 +++++++++++++++++++--- 6 files changed, 129 insertions(+), 22 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 4349a8c2b97832..203e26da5f920d 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -244,6 +244,41 @@ write-protected (so future writes will also result in a WP fault). These ioctls support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` respectively) to configure the mapping this way. +If the userfaultfd context has ``UFFD_FEATURE_WP_ASYNC`` feature bit set, +any vma registered with write-protection will work in async mode rather +than the default sync mode. + +In async mode, there will be no message generated when a write operation +happens, meanwhile the write-protection will be resolved automatically by +the kernel. It can be seen as a more accurate version of soft-dirty +tracking and it can be different in a few ways: + + - The dirty result will not be affected by vma changes (e.g. vma + merging) because the dirty is only tracked by the pte. + + - It supports range operations by default, so one can enable tracking on + any range of memory as long as page aligned. + + - Dirty information will not get lost if the pte was zapped due to + various reasons (e.g. during split of a shmem transparent huge page). + + - Due to a reverted meaning of soft-dirty (page clean when uffd-wp bit + set; dirty when uffd-wp bit cleared), it has different semantics on + some of the memory operations. For example: ``MADV_DONTNEED`` on + anonymous (or ``MADV_REMOVE`` on a file mapping) will be treated as + dirtying of memory by dropping uffd-wp bit during the procedure. + +The user app can collect the "written/dirty" status by looking up the +uffd-wp bit for the pages being interested in /proc/pagemap. + +The page will not be under track of uffd-wp async mode until the page is +explicitly write-protected by ``ioctl(UFFDIO_WRITEPROTECT)`` with the mode +flag ``UFFDIO_WRITEPROTECT_MODE_WP`` set. Trying to resolve a page fault +that was tracked by async mode userfaultfd-wp is invalid. + +When userfaultfd-wp async mode is used alone, it can be applied to all +kinds of memory. + Memory Poisioning Emulation --------------------------- diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1ab2..a7c6ef764e6350 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's @@ -1325,6 +1330,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1399,7 +1405,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur, vm_flags)) + if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* @@ -1460,7 +1466,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags)); + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1561,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; ret = -EFAULT; @@ -1615,7 +1622,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur, cur->vm_flags)) + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; @@ -1631,7 +1638,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); /* * Nothing to do: this vma is already registered into this @@ -2018,6 +2025,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long return ret; } +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2051,6 +2063,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR @@ -2063,6 +2080,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097cd7..c98df391bfd8f5 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) } static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) + unsigned long vm_flags, + bool wp_async) { + vm_flags &= __VM_UFFD_FLAGS; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; + + /* + * If wp async enabled, and WP is the only mode enabled, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for @@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; #endif + + /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } @@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); +extern bool userfaultfd_wp_async(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a38a..0dbc81015018bc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfb4534834f5a0..bc654b36df9f97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; + if (!userfaultfd_wp_async(vma)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; - spin_unlock(ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); + spin_unlock(ptl); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); } - hugetlb_vma_unlock_read(vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, VM_UFFD_WP); + + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(mm, haddr, ptep, entry); + /* Fallthrough to CoW */ } /* diff --git a/mm/memory.c b/mm/memory.c index 2d209bc4d18c10..d0f61d729fb46c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c @@ -3349,11 +3350,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; + pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (!userfaultfd_wp_async(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); + + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + /* + * Update this to be prepared for following up CoW + * handling + */ + vmf->orig_pte = pte; } /* @@ -4879,8 +4897,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) + goto split; return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); } @@ -4892,6 +4913,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } } +split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- cgit 1.2.3-korg From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:14 +0500 Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally clear the info about page table entries. The following operations are supported in this IOCTL: - Scan the address range and get the memory ranges matching the provided criteria. This is performed when the output buffer is specified. - Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING`` can be used with or without PM_SCAN_CHECK_WPASYNC. - Both of those operations can be combined into one atomic operation where we can get and write protect the pages as well. Following flags about pages are currently supported: - PAGE_IS_WPALLOWED - Page has async-write-protection enabled - PAGE_IS_WRITTEN - Page has been written to from the time it was write protected - PAGE_IS_FILE - Page is file backed - PAGE_IS_PRESENT - Page is present in the memory - PAGE_IS_SWAPPED - Page is in swapped - PAGE_IS_PFNZERO - Page has zero PFN - PAGE_IS_HUGE - Page is THP or Hugetlb backed This IOCTL can be extended to get information about more PTE bits. The entire address range passed by user [start, end) is scanned until either the user provided buffer is full or max_pages have been found. [akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"] [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning] [arnd@arndb.de: hide unused pagemap_scan_backout_range() function] Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org [sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"] Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Michał Mirosław Signed-off-by: Arnd Bergmann Signed-off-by: Stephen Rothwell Reviewed-by: Andrei Vagin Reviewed-by: Michał Mirosław Cc: Alex Sierra Cc: Al Viro Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 692 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 1 + include/linux/userfaultfd_k.h | 7 + include/uapi/linux/fs.h | 59 ++++ mm/hugetlb.c | 5 +- 5 files changed, 762 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b4c..d4ef9a2bf95dbb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories = 0; + + if (pte_present(pte)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pte_to_swp_entry(pte); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } else { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pmd_pfn(pmd))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pmd(pmd)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pmd_to_swp_entry(pmd); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if (pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + unsigned long psize; + + if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + return; + + psize = huge_page_size(hstate_vma(vma)); + + if (is_hugetlb_entry_migration(ptent)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else if (!huge_pte_none(ptent)) + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); + else + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + + if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma)) + vma_category |= PAGE_IS_WPALLOWED; + else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. + */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + arch_enter_lazy_mmu_mode(); + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) { + arch_leave_lazy_mmu_mode(); + return ret; + } + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + arch_leave_lazy_mmu_mode(); + walk->action = ACTION_AGAIN; + return 0; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptep_get(pte)); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + + if (flush_end) + flush_tlb_range(vma, start, addr); + + pte_unmap_unlock(start_pte, ptl); + arch_leave_lazy_mmu_mode(); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. */ + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + arg->vec_len * sizeof(struct page_region))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct mmu_notifier_range range; + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, p.arg.start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. */ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e5b9f7e62eeb7f..205469aa061348 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); +bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c98df391bfd8f5..f2dc19f40d0596 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, return VM_FAULT_SIGBUS; } +static inline long uffd_wp_range(struct vm_area_struct *vma, + unsigned long start, unsigned long len, + bool enable_wp) +{ + return false; +} + static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c58..da43810b74856b 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* Pagemap ioctl */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + __u64 start; + __u64 end; + __u64 categories; +}; + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + __u64 size; + __u64 flags; + __u64 start; + __u64 end; + __u64 walk_end; + __u64 vec; + __u64 vec_len; + __u64 max_pages; + __u64 category_inverted; + __u64 category_mask; + __u64 category_anyof_mask; + __u64 return_mask; +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc654b36df9f97..2878e0e6bac5c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte) return false; } -static bool is_hugetlb_entry_hwpoisoned(pte_t pte) +bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry); + set_huge_pte_at(mm, haddr, ptep, entry, + huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } -- cgit 1.2.3-korg From 5ca432896a4ce6d69fffc3298b24c0dd9bdb871f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:47 +0200 Subject: mm/rmap: move SetPageAnonExclusive() out of page_move_anon_rmap() Patch series "mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()". Convert page_move_anon_rmap() to folio_move_anon_rmap(), letting the callers handle PageAnonExclusive. I'm including cleanup patch #3 because it fits into the picture and can be done cleaner by the conversion. This patch (of 3): Let's move it into the caller: there is a difference between whether an anon folio can only be mapped by one process (e.g., into one VMA), and whether it is truly exclusive (e.g., no references -- including GUP -- from other processes). Further, for large folios the page might not actually be pointing at the head page of the folio, so it better be handled in the caller. This is a preparation for converting page_move_anon_rmap() to consume a folio. Link: https://lkml.kernel.org/r/20231002142949.235104-1-david@redhat.com Link: https://lkml.kernel.org/r/20231002142949.235104-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 + mm/hugetlb.c | 4 +++- mm/memory.c | 1 + mm/rmap.c | 1 - 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aa022455613261..76ead290f1c82b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1377,6 +1377,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) pmd_t entry; page_move_anon_rmap(page, vma); + SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2878e0e6bac5c8..35d924f7497231 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5652,8 +5652,10 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * owner and can reuse this page. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { - if (!PageAnonExclusive(&old_folio->page)) + if (!PageAnonExclusive(&old_folio->page)) { page_move_anon_rmap(&old_folio->page, vma); + SetPageAnonExclusive(&old_folio->page); + } if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); diff --git a/mm/memory.c b/mm/memory.c index ceffe96f2a0e33..21fba1c9a6c730 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3481,6 +3481,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * sunglasses. Hit it. */ page_move_anon_rmap(vmf->page, vma); + SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/rmap.c b/mm/rmap.c index c6bb2339e35b26..6f1ea1491118c5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1152,7 +1152,6 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); - SetPageAnonExclusive(page); } /** -- cgit 1.2.3-korg From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:48 +0200 Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap() Let's convert it to consume a folio. [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 2 +- mm/rmap.c | 16 +++++++--------- 5 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d22f4d21a11ca7..b26fe858fd444c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -189,7 +189,7 @@ typedef int __bitwise rmap_t; /* * rmap interfaces called when adding or removing pte of page */ -void page_move_anon_rmap(struct page *, struct vm_area_struct *); +void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76ead290f1c82b..51a66eb48938af 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) if (folio_ref_count(folio) == 1) { pmd_t entry; - page_move_anon_rmap(page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 35d924f7497231..7ad9d2159da426 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5653,7 +5653,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { - page_move_anon_rmap(&old_folio->page, vma); + folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) diff --git a/mm/memory.c b/mm/memory.c index 21fba1c9a6c730..6c67828f934c57 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3480,7 +3480,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(vmf->page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: diff --git a/mm/rmap.c b/mm/rmap.c index 6f1ea1491118c5..7a27a2b4180210 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1128,19 +1128,17 @@ int folio_total_mapcount(struct folio *folio) } /** - * page_move_anon_rmap - move a page to our anon_vma - * @page: the page to move to our anon_vma - * @vma: the vma the page belongs to + * folio_move_anon_rmap - move a folio to our anon_vma + * @folio: The folio to move to our anon_vma + * @vma: The vma the folio belongs to * - * When a page belongs exclusively to one process after a COW event, - * that page can be moved into the anon_vma that belongs to just that - * process, so the rmap code will not search the parent or sibling - * processes. + * When a folio belongs exclusively to one process after a COW event, + * that folio can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling processes. */ -void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) +void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); -- cgit 1.2.3-korg From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 4 Oct 2023 15:32:48 +0000 Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER Originally, hugetlb_cgroup was the only hugetlb user of tail page structure fields. So, the code defined and checked against HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use. However, by now, tail page #2 is used to store hugetlb hwpoison and subpool information as well. In other words, without that tail page hugetlb doesn't work. Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and checks against it. Instead, just check for the minimum viable page order at hstate creation time. Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com Signed-off-by: Frank van der Linden Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 11 ----------- mm/hugetlb.c | 2 +- mm/hugetlb_cgroup.c | 20 ++------------------ 3 files changed, 3 insertions(+), 30 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 3d82d91f49acb0..e5d64b8b59c202 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -22,13 +22,6 @@ struct resv_map; struct file_region; #ifdef CONFIG_CGROUP_HUGETLB -/* - * Minimum page order trackable by hugetlb cgroup. - * At least 3 pages are necessary for all the tracking information. - * The second tail page contains all of the hugetlb-specific fields. - */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) - enum hugetlb_memory_event { HUGETLB_MAX, HUGETLB_NR_MEMORY_EVENTS, @@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) - return NULL; if (rsvd) return folio->_hugetlb_cgroup_rsvd; else @@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) - return; if (rsvd) folio->_hugetlb_cgroup_rsvd = h_cg; else diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7ad9d2159da426..e2b1c417b90ae4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4361,7 +4361,7 @@ void __init hugetlb_add_hstate(unsigned int order) return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); + BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; mutex_init(&h->resize_lock); h->order = order; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index dedd2edb076ec2..aa4486bd390493 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -262,12 +262,6 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) goto done; - /* - * We don't charge any cgroup if the compound page have less - * than 3 pages. - */ - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); @@ -397,9 +391,6 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - return; - page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); @@ -869,15 +860,8 @@ void __init hugetlb_cgroup_file_init(void) { struct hstate *h; - for_each_hstate(h) { - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].private for storing cgroup details. - */ - if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) - __hugetlb_cgroup_file_init(hstate_index(h)); - } + for_each_hstate(h) + __hugetlb_cgroup_file_init(hstate_index(h)); } /* -- cgit 1.2.3-korg From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:28 -0700 Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etc. fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller). Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg. Some caveats to consider: * This feature is only available on cgroup v2. * There is no hugetlb pool management involved in the memory controller. As stated above, hugetlb folios are only charged towards the memory controller when it is used. Host overcommit management has to consider it when configuring hard limits. * Failure to charge towards the memcg results in SIGBUS. This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and reclaim attempt fails). * When this feature is enabled, hugetlb pages contribute to memory reclaim protection. low, min limits tuning must take into account hugetlb memory. * Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on). Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++++++++ include/linux/cgroup-defs.h | 5 ++++ include/linux/memcontrol.h | 9 +++++++ kernel/cgroup/cgroup.c | 15 +++++++++++- mm/hugetlb.c | 35 +++++++++++++++++++++------ mm/memcontrol.c | 42 ++++++++++++++++++++++++++++++++- mm/migrate.c | 3 +-- 7 files changed, 127 insertions(+), 11 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 622a7f28db1fd5..606b2e0eac4b17 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options. relying on the original semantics (e.g. specifying bogusly high 'bypass' protection values at higher tree levels). + memory_hugetlb_accounting + Count HugeTLB memory usage towards the cgroup's overall + memory usage for the memory controller (for the purpose of + statistics reporting and memory protetion). This is a new + behavior that could regress existing setups, so it must be + explicitly opted in with this mount option. + + A few caveats to keep in mind: + + * There is no HugeTLB pool management involved in the memory + controller. The pre-allocated pool does not belong to anyone. + Specifically, when a new HugeTLB folio is allocated to + the pool, it is not accounted for from the perspective of the + memory controller. It is only charged to a cgroup when it is + actually used (for e.g at page fault time). Host memory + overcommit management has to consider this when configuring + hard limits. In general, HugeTLB pool management should be + done via other mechanisms (such as the HugeTLB controller). + * Failure to charge a HugeTLB folio to the memory controller + results in SIGBUS. This could happen even if the HugeTLB pool + still has pages available (but the cgroup limit is hit and + reclaim attempt fails). + * Charging HugeTLB memory towards the memory controller affects + memory protection and reclaim dynamics. Any userspace tuning + (of low, min limits for e.g) needs to take this into account. + * HugeTLB pages utilized while this option is not selected + will not be tracked by the memory controller (even if cgroup + v2 is remounted later on). + Organizing Processes and Threads -------------------------------- diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f1b3151ac30bf0..8641f4320c9803 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -115,6 +115,11 @@ enum { * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), + + /* + * Enable hugetlb accounting for the memory controller. + */ + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3fce570122ec7..6674c12725d56c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } +static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, + gfp_t gfp, long nr_pages) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d53..f11488b18ceb2c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1902,6 +1902,7 @@ enum cgroup2_param { Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, nr__cgroup2_params }; @@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), {} }; @@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; } return -EINVAL; } @@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; } } @@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); return 0; } @@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e2b1c417b90ae4..da6f85b7db887f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3147,7 +3164,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 135d6637afe552..a86e7b44580081 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) return ret; } +/** + * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio + * @memcg: memcg to charge. + * @gfp: reclaim mode. + * @nr_pages: number of pages to charge. + * + * This function is called when allocating a huge page folio to determine if + * the memcg has the capacity for it. It does not commit the charge yet, + * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * + * Once we have obtained the hugetlb folio, we can call + * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the + * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect + * of try_charge(). + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages) +{ + /* + * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, + * but do not attempt to commit charge later (or cancel on error) either. + */ + if (mem_cgroup_disabled() || !memcg || + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + return -EOPNOTSUPP; + + if (try_charge(memcg, gfp, nr_pages)) + return -ENOMEM; + + return 0; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. @@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); + /* + * Note that it is normal to see !memcg for a hugetlb folio. + * For e.g, itt could have been allocated when memory_hugetlb_accounting + * was not selected. + */ + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); if (!memcg) return; diff --git a/mm/migrate.c b/mm/migrate.c index 9d9eee5322d191..c602bf6dec9783 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_copy_owner(newfolio, folio); - if (!folio_test_hugetlb(folio)) - mem_cgroup_migrate(folio, newfolio); + mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); -- cgit 1.2.3-korg From d2cf88c27f51361dfe2982e510a444411d2fc78a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:03 -0700 Subject: hugetlb: optimize update_and_free_pages_bulk to avoid lock cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Batch hugetlb vmemmap modification operations", v8. When hugetlb vmemmap optimization was introduced, the overhead of enabling the option was measured as described in commit 426e5c429d16 [1]. The summary states that allocating a hugetlb page should be ~2x slower with optimization and freeing a hugetlb page should be ~2-3x slower. Such overhead was deemed an acceptable trade off for the memory savings obtained by freeing vmemmap pages. It was recently reported that the overhead associated with enabling vmemmap optimization could be as high as 190x for hugetlb page allocations. Yes, 190x! Some actual numbers from other environments are: Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895 ------------------------------------------------ Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.119s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.477s Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m28.973s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m36.748s VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan ----------------------------------------------------------- Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.463s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m2.931s Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 2m27.609s time echo 0 > .../hugepages-2048kB/nr_hugepages real 2m29.924s In the VM environment, the slowdown of enabling hugetlb vmemmap optimization resulted in allocation times being 61x slower. A quick profile showed that the vast majority of this overhead was due to TLB flushing. Each time we modify the kernel pagetable we need to flush the TLB. For each hugetlb that is optimized, there could be potentially two TLB flushes performed. One for the vmemmap pages associated with the hugetlb page, and potentially another one if the vmemmap pages are mapped at the PMD level and must be split. The TLB flushes required for the kernel pagetable, result in a broadcast IPI with each CPU having to flush a range of pages, or do a global flush if a threshold is exceeded. So, the flush time increases with the number of CPUs. In addition, in virtual environments the broadcast IPI can’t be accelerated by hypervisor hardware and leads to traps that need to wakeup/IPI all vCPUs which is very expensive. Because of this the slowdown in virtual environments is even worse than bare metal as the number of vCPUS/CPUs is increased. The following series attempts to reduce amount of time spent in TLB flushing. The idea is to batch the vmemmap modification operations for multiple hugetlb pages. Instead of doing one or two TLB flushes for each page, we do two TLB flushes for each batch of pages. One flush after splitting pages mapped at the PMD level, and another after remapping vmemmap associated with all hugetlb pages. Results of such batching are as follows: Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895 ------------------------------------------------ next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.719s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.245s next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m7.267s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m13.199s VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan ----------------------------------------------------------- next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.715s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m3.186s next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m4.799s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m5.273s With batching, results are back in the 2-3x slowdown range. This patch (of 8): update_and_free_pages_bulk is designed to free a list of hugetlb pages back to their associated lower level allocators. This may require allocating vmemmmap pages associated with each hugetlb page. The hugetlb page destructor must be changed before pages are freed to lower level allocators. However, the destructor must be changed under the hugetlb lock. This means there is potentially one lock cycle per page. Minimize the number of lock cycles in update_and_free_pages_bulk by: 1) allocating necessary vmemmap for all hugetlb pages on the list 2) take hugetlb lock and clear destructor for all pages on the list 3) free all pages on list back to low level allocators Link: https://lkml.kernel.org/r/20231019023113.345257-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20231019023113.345257-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Acked-by: James Houghton Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da6f85b7db887f..b839080a2a6b92 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1862,7 +1862,46 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct folio *folio, *t_folio; + bool clear_dtor = false; + /* + * First allocate required vmemmmap (if necessary) for all folios on + * list. If vmemmap can not be allocated, we can not free folio to + * lower level allocator, so add back as hugetlb surplus page. + * add_hugetlb_folio() removes the page from THIS list. + * Use clear_dtor to note if vmemmap was successfully allocated for + * ANY page on the list. + */ + list_for_each_entry_safe(folio, t_folio, list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, true); + spin_unlock_irq(&hugetlb_lock); + } else + clear_dtor = true; + } + } + + /* + * If vmemmmap allocation was performed on any folio above, take lock + * to clear destructor of all folios on list. This avoids the need to + * lock/unlock for each individual folio. + * The assumption is vmemmap allocation was performed on all or none + * of the folios on the list. This is true expect in VERY rare cases. + */ + if (clear_dtor) { + spin_lock_irq(&hugetlb_lock); + list_for_each_entry(folio, list, lru) + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + } + + /* + * Free folios back to low level allocators. vmemmap and destructors + * were taken care of above, so update_and_free_hugetlb_folio will + * not need to take hugetlb lock. + */ list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); -- cgit 1.2.3-korg From d67e32f26713c35669e343edfdce6fa97a7d6bbc Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:04 -0700 Subject: hugetlb: restructure pool allocations Allocation of a hugetlb page for the hugetlb pool is done by the routine alloc_pool_huge_page. This routine will allocate contiguous pages from a low level allocator, prep the pages for usage as a hugetlb page and then add the resulting hugetlb page to the pool. In the 'prep' stage, optional vmemmap optimization is done. For performance reasons we want to perform vmemmap optimization on multiple hugetlb pages at once. To do this, restructure the hugetlb pool allocation code such that vmemmap optimization can be isolated and later batched. The code to allocate hugetlb pages from bootmem was also modified to allow batching. No functional changes, only code restructure. Link: https://lkml.kernel.org/r/20231019023113.345257-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Tested-by: Sergey Senozhatsky Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 141 insertions(+), 39 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b839080a2a6b92..559f7c71c5969a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,16 +1996,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { folio_set_hugetlb(folio); - hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +{ + init_new_hugetlb_folio(h, folio); + hugetlb_vmemmap_optimize(h, &folio->page); +} + static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); @@ -2202,16 +2207,9 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, return page_folio(page); } -/* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages - * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. - */ -static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; bool retry = false; @@ -2224,6 +2222,7 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, nid, nmask, node_alloc_noretry); if (!folio) return NULL; + if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* @@ -2238,32 +2237,81 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, return NULL; } } - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } +static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + struct folio *folio; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (folio) + init_new_hugetlb_folio(h, folio); + return folio; +} + /* - * Allocates a fresh page to the hugetlb allocator pool in the node interleaved - * manner. + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; - int nr_nodes, node; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (!folio) + return NULL; + + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + return folio; +} + +static void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + +/* + * Allocates a fresh hugetlb page in a node interleaved manner. The page + * will later be added to the appropriate hugetlb pool. + */ +static struct folio *alloc_pool_huge_folio(struct hstate *h, + nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) +{ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nr_nodes, node; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + struct folio *folio; + + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); - if (folio) { - free_huge_folio(folio); /* free it into the hugepage allocator */ - return 1; - } + if (folio) + return folio; } - return 0; + return NULL; } /* @@ -3302,25 +3350,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, */ static void __init gather_bootmem_prealloc(void) { + LIST_HEAD(folio_list); struct huge_bootmem_page *m; + struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; - struct hstate *h = m->hstate; + + h = m->hstate; + /* + * It is possible to have multiple huge page sizes (hstates) + * in this list. If so, process each size separately. + */ + if (h != prev_h && prev_h != NULL) + prep_and_add_allocated_folios(prev_h, &folio_list); + prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + __prep_new_hugetlb_folio(h, folio); /* If HVO fails, initialize all tail struct pages */ if (!HPageVmemmapOptimized(&folio->page)) hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); - free_huge_folio(folio); /* add to the hugepage allocator */ + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages @@ -3330,6 +3388,8 @@ static void __init gather_bootmem_prealloc(void) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } + + prep_and_add_allocated_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) @@ -3363,9 +3423,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +/* + * NOTE: this routine is called in different contexts for gigantic and + * non-gigantic pages. + * - For gigantic pages, this is called early in the boot process and + * pages are allocated from memblock allocated or something similar. + * Gigantic pages are actually added to pools later with the routine + * gather_bootmem_prealloc. + * - For non-gigantic pages, this is called later in the boot process after + * all of mm is up and functional. Pages are allocated from buddy and + * then added to hugetlb pools. + */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; bool node_specific_alloc = false; @@ -3407,14 +3480,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { + /* + * gigantic pages not added to list as they are not + * added to pools now. + */ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; - } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY], - node_alloc_noretry)) - break; + } else { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + } cond_resched(); } + + /* list will be empty if hstate_is_gigantic */ + prep_and_add_allocated_folios(h, &folio_list); + if (i < h->max_huge_pages) { char buf[32]; @@ -3548,7 +3632,9 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { - unsigned long min_count, ret; + unsigned long min_count; + unsigned long allocated; + struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3625,7 +3711,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, break; } - while (count > persistent_huge_pages(h)) { + allocated = 0; + while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page @@ -3636,15 +3723,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed, + folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry); - spin_lock_irq(&hugetlb_lock); - if (!ret) + if (!folio) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + list_add(&folio->lru, &page_list); + allocated++; /* Bail for signals. Probably ctrl-c from user */ - if (signal_pending(current)) + if (signal_pending(current)) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + spin_lock_irq(&hugetlb_lock); + } + + /* Add allocated pages to the pool */ + if (!list_empty(&page_list)) { + spin_unlock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); } /* @@ -3670,8 +3774,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - struct folio *folio; - folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; -- cgit 1.2.3-korg From 79359d6d24df2f304abbf6f137b01d2b76175ce3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:05 -0700 Subject: hugetlb: perform vmemmap optimization on a list of pages When adding hugetlb pages to the pool, we first create a list of the allocated pages before adding to the pool. Pass this list of pages to a new routine hugetlb_vmemmap_optimize_folios() for vmemmap optimization. Due to significant differences in vmemmmap initialization for bootmem allocated hugetlb pages, a new routine prep_and_add_bootmem_folios is created. We also modify the routine vmemmap_should_optimize() to check for pages that are already optimized. There are code paths that might request vmemmap optimization twice and we want to make sure this is not attempted. Link: https://lkml.kernel.org/r/20231019023113.345257-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++-------- mm/hugetlb_vmemmap.c | 11 +++++++++++ mm/hugetlb_vmemmap.h | 5 +++++ 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 559f7c71c5969a..8b171f866d0ac3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2282,6 +2282,9 @@ static void prep_and_add_allocated_folios(struct hstate *h, unsigned long flags; struct folio *folio, *tmp_f; + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { @@ -3344,6 +3347,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static void __init prep_and_add_bootmem_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + if (!folio_test_hugetlb_vmemmap_optimized(folio)) { + /* + * If HVO fails, initialize all tail struct pages + * We do not worry about potential long lock hold + * time as this is early in boot and there should + * be no contention. + */ + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + } + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3364,7 +3396,7 @@ static void __init gather_bootmem_prealloc(void) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) - prep_and_add_allocated_folios(prev_h, &folio_list); + prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); @@ -3372,12 +3404,7 @@ static void __init gather_bootmem_prealloc(void) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - __prep_new_hugetlb_folio(h, folio); - /* If HVO fails, initialize all tail struct pages */ - if (!HPageVmemmapOptimized(&folio->page)) - hugetlb_folio_init_tail_vmemmap(folio, - HUGETLB_VMEMMAP_RESERVE_PAGES, - pages_per_huge_page(h)); + init_new_hugetlb_folio(h, folio); list_add(&folio->lru, &folio_list); /* @@ -3389,7 +3416,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } - prep_and_add_allocated_folios(h, &folio_list); + prep_and_add_bootmem_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 76682d1d79a746..4558b814ffabfe 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -483,6 +483,9 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { + if (HPageVmemmapOptimized((struct page *)head)) + return false; + if (!READ_ONCE(vmemmap_optimize_enabled)) return false; @@ -572,6 +575,14 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) SetHPageVmemmapOptimized(head); } +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ + struct folio *folio; + + list_for_each_entry(folio, folio_list, lru) + hugetlb_vmemmap_optimize(h, &folio->page); +} + static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 4573899855d706..c512e388dbb493 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -20,6 +20,7 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { @@ -48,6 +49,10 @@ static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page { } +static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ +} + static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; -- cgit 1.2.3-korg From cfb8c75099dbf152db1b075eead5029c5a30ce51 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:06 -0700 Subject: hugetlb: perform vmemmap restoration on a list of pages The routine update_and_free_pages_bulk already performs vmemmap restoration on the list of hugetlb pages in a separate step. In preparation for more functionality to be added in this step, create a new routine hugetlb_vmemmap_restore_folios() that will restore vmemmap for a list of folios. This new routine must provide sufficient feedback about errors and actual restoration performed so that update_and_free_pages_bulk can perform optimally. Special care must be taken when encountering an error from hugetlb_vmemmap_restore_folios. We want to continue making as much forward progress as possible. A new routine bulk_vmemmap_restore_error handles this specific situation. Link: https://lkml.kernel.org/r/20231019023113.345257-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 99 +++++++++++++++++++++++++++++++++++++--------------- mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++ mm/hugetlb_vmemmap.h | 11 ++++++ 3 files changed, 120 insertions(+), 28 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b171f866d0ac3..cf834bb7f820a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,50 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, schedule_work(&free_hpage_work); } -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +static void bulk_vmemmap_restore_error(struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; - bool clear_dtor = false; - /* - * First allocate required vmemmmap (if necessary) for all folios on - * list. If vmemmap can not be allocated, we can not free folio to - * lower level allocator, so add back as hugetlb surplus page. - * add_hugetlb_folio() removes the page from THIS list. - * Use clear_dtor to note if vmemmap was successfully allocated for - * ANY page on the list. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { - if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (!list_empty(non_hvo_folios)) { + /* + * Free any restored hugetlb pages so that restore of the + * entire list can be retried. + * The idea is that in the common case of ENOMEM errors freeing + * hugetlb pages with vmemmap we will free up memory so that we + * can allocate vmemmap for more hugetlb pages. + */ + list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + } + } else { + /* + * In the case where there are no folios which can be + * immediately freed, we loop through the list trying to restore + * vmemmap individually in the hope that someone elsewhere may + * have done something to cause success (such as freeing some + * memory). If unable to restore a hugetlb page, the hugetlb + * page is made a surplus page and removed from the list. + * If are able to restore vmemmap and free one hugetlb page, we + * quit processing the list to retry the bulk operation. + */ + list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore(h, &folio->page)) { + list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); - } else - clear_dtor = true; - } + } else { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + break; + } } +} + +static void update_and_free_pages_bulk(struct hstate *h, + struct list_head *folio_list) +{ + long ret; + struct folio *folio, *t_folio; + LIST_HEAD(non_hvo_folios); /* - * If vmemmmap allocation was performed on any folio above, take lock - * to clear destructor of all folios on list. This avoids the need to - * lock/unlock for each individual folio. - * The assumption is vmemmap allocation was performed on all or none - * of the folios on the list. This is true expect in VERY rare cases. + * First allocate required vmemmmap (if necessary) for all folios. + * Carefully handle errors and free up any available hugetlb pages + * in an effort to make forward progress. */ - if (clear_dtor) { +retry: + ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); + if (ret < 0) { + bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); + goto retry; + } + + /* + * At this point, list should be empty, ret should be >= 0 and there + * should only be pages on the non_hvo_folios list. + * Do note that the non_hvo_folios list could be empty. + * Without HVO enabled, ret will be 0 and there is no need to call + * __clear_hugetlb_destructor as this was done previously. + */ + VM_WARN_ON(!list_empty(folio_list)); + VM_WARN_ON(ret < 0); + if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(folio, list, lru) + list_for_each_entry(folio, &non_hvo_folios, lru) __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } - /* - * Free folios back to low level allocators. vmemmap and destructors - * were taken care of above, so update_and_free_hugetlb_folio will - * not need to take hugetlb lock. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { + list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4558b814ffabfe..77f44b81ff01d8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -480,6 +480,44 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) return ret; } +/** + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. + * @h: hstate. + * @folio_list: list of folios. + * @non_hvo_folios: Output list of folios for which vmemmap exists. + * + * Return: number of folios for which vmemmap was restored, or an error code + * if an error was encountered restoring vmemmap for a folio. + * Folios that have vmemmap are moved to the non_hvo_folios + * list. Processing of entries stops when the first error is + * encountered. The folio that experienced the error and all + * non-processed folios will remain on folio_list. + */ +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + struct folio *folio, *t_folio; + long restored = 0; + long ret = 0; + + list_for_each_entry_safe(folio, t_folio, folio_list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + ret = hugetlb_vmemmap_restore(h, &folio->page); + if (ret) + break; + restored++; + } + + /* Add non-optimized folios to output list */ + list_move(&folio->lru, non_hvo_folios); + } + + if (!ret) + ret = restored; + return ret; +} + /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index c512e388dbb493..a0dcf49f46bab3 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -19,6 +19,9 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); @@ -45,6 +48,14 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h return 0; } +static long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + list_splice_init(folio_list, non_hvo_folios); + return 0; +} + static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -- cgit 1.2.3-korg From c5ad3233ead566d74918bd76ba2dbdbe83ba1d9d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 11 Oct 2023 15:45:57 +0100 Subject: hugetlb_vmemmap: use folio argument for hugetlb_vmemmap_* functions Most function calls in hugetlb.c are made with folio arguments. This brings hugetlb_vmemmap calls inline with them by using folio instead of head struct page. Head struct page is still needed within these functions. The set/clear/test functions for hugepages are also changed to folio versions. Link: https://lkml.kernel.org/r/20231011144557.1720481-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- mm/hugetlb_vmemmap.c | 50 ++++++++++++++++++++++++++------------------------ mm/hugetlb_vmemmap.h | 8 ++++---- 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf834bb7f820a5..dd8065e360388a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1747,10 +1747,10 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, /* * If folio is not vmemmap optimized (!clear_dtor), then the folio - * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { + if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1893,7 +1893,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) - if (hugetlb_vmemmap_restore(h, &folio->page)) { + if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); @@ -2051,7 +2051,7 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize(h, &folio->page); + hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) @@ -2462,7 +2462,7 @@ int dissolve_free_huge_page(struct page *page) * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); @@ -3886,11 +3886,11 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) /* * If vmemmap already existed for folio, the remove routine above would * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be * passed hugetlb folios and will BUG otherwise. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index d2999c303031df..87818ee7f01d7e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -495,14 +495,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, unsigned long flags) +static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE(!PageHuge(head)); - if (!HPageVmemmapOptimized(head)) + if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -518,7 +519,7 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } @@ -526,18 +527,18 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, } /** - * hugetlb_vmemmap_restore - restore previously optimized (by - * hugetlb_vmemmap_optimize()) vmemmap pages which + * hugetlb_vmemmap_restore_folio - restore previously optimized (by + * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be restored. + * @folio: the folio whose vmemmap pages will be restored. * - * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - return __hugetlb_vmemmap_restore(h, head, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** @@ -563,7 +564,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore(h, &folio->page, + ret = __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; @@ -640,12 +641,13 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h return true; } -static int __hugetlb_vmemmap_optimize(const struct hstate *h, - struct page *head, +static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, + struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; @@ -665,7 +667,7 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ - SetHPageVmemmapOptimized(head); + folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; @@ -681,27 +683,27 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** - * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be optimized. + * @folio: the folio whose vmemmap pages will be optimized. * - * This function only tries to optimize @head's vmemmap pages and does not + * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller - * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages - * have been optimized. + * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's + * vmemmap pages have been optimized. */ -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -745,7 +747,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { - int ret = __hugetlb_vmemmap_optimize(h, &folio->page, + int ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); @@ -754,14 +756,14 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both spliting fails * halfway and head page allocation also failed. In this - * case __hugetlb_vmemmap_optimize() would free memory + * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize(h, &folio->page, + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index a0dcf49f46bab3..9fd0cb2705026e 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -18,11 +18,11 @@ #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) @@ -43,7 +43,7 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } @@ -56,7 +56,7 @@ static long hugetlb_vmemmap_restore_folios(const struct hstate *h, return 0; } -static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } -- cgit 1.2.3-korg From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:27:47 -0700 Subject: mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 9 ------ mm/hugetlb.c | 38 +++++++++++----------- mm/mempolicy.c | 83 +++++++++++++++++++++++++------------------------ 3 files changed, 63 insertions(+), 67 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 158ff156710bed..d3acecc5db4b33 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, - struct vm_area_struct *vma, - unsigned long address) -{ - return NULL; -} - static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd8065e360388a..1169ef2f2176fa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -6559,6 +6541,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). + */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ded5508f09729f..0594362f247087 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { @@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); vma = find_vma(mm, 0); /* @@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) @@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. + * Allocate a new folio for page migration, according to NUMA mempolicy. */ -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - - for_each_vma(vmi, vma) { - address = page_address_in_vma(&src->page, vma); - if (address != -EFAULT) - break; - } + struct mempolicy *pol = (struct mempolicy *)private; + pgoff_t ilx = 0; /* improve on this later */ + struct page *page; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - /* - * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL - * when the page can no longer be located in a vma: that is not ideal - * (migrate_pages() will give up early, presuming ENOMEM), but good - * enough to avoid a crash by syzkaller or concurrent holepunch. - */ - if (!vma) - return NULL; + order = folio_order(src); + ilx += src->index >> order; if (folio_test_hugetlb(src)) { - return alloc_hugetlb_folio_vma(folio_hstate(src), - vma, address); + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - return vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); + return page_rmappable_folio(page); } #else @@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed < 0) { err = nr_failed; + nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); @@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len, } } - if (!err) { - if (!list_empty(&pagelist)) { - nr_failed |= migrate_pages(&pagelist, new_folio, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } - if (nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)new, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); } + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - - mmap_write_unlock(mm); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) -- cgit 1.2.3-korg