From f41f2ed43ca5258d70d53290d1951a21621f95c8 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:13 -0700 Subject: mm: hugetlb: free the vmemmap pages associated with each HugeTLB page Every HugeTLB has more than one struct page structure. We __know__ that we only use the first 4 (__NR_USED_SUBPAGE) struct page structures to store metadata associated with each HugeTLB. There are a lot of struct page structures associated with each HugeTLB page. For tail pages, the value of compound_head is the same. So we can reuse first page of tail page structures. We map the virtual addresses of the remaining pages of tail page structures to the first tail page struct, and then free these page frames. Therefore, we need to reserve two pages as vmemmap areas. When we allocate a HugeTLB page from the buddy, we can free some vmemmap pages associated with each HugeTLB page. It is more appropriate to do it in the prep_new_huge_page(). The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap pages associated with a HugeTLB page can be freed, returns zero for now, which means the feature is disabled. We will enable it once all the infrastructure is there. [willy@infradead.org: fix documentation warning] Link: https://lkml.kernel.org/r/20210615200242.1716568-5-willy@infradead.org Link: https://lkml.kernel.org/r/20210510030027.56044-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Oscar Salvador Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Acked-by: Michal Hocko Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Balbir Singh Cc: Barry Song Cc: Borislav Petkov Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. 
McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 103f1187043f8f..5f5493f0f00336 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -41,6 +41,7 @@ #include #include #include "internal.h" +#include "hugetlb_vmemmap.h" int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -1493,8 +1494,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct page *page) +static void __prep_new_huge_page(struct hstate *h, struct page *page) { + free_huge_page_vmemmap(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -1504,7 +1506,7 @@ static void __prep_new_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - __prep_new_huge_page(page); + __prep_new_huge_page(h, page); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); @@ -2351,14 +2353,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, /* * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Using alloc_buddy_huge_page() allows us to - * not having to deal with prep_new_huge_page() and avoids dealing of any - * counters. This simplifies and let us do the whole thing under the - * lock. + * pool to remain stable. Here, we allocate the page and 'prep' it + * by doing everything but actually updating counters and adding to + * the pool. This simplifies and let us do most of the processing + * under the lock. 
*/ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; + __prep_new_huge_page(h, new_page); retry: spin_lock_irq(&hugetlb_lock); @@ -2397,14 +2400,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, remove_hugetlb_page(h, old_page, false); /* - * new_page needs to be initialized with the standard hugetlb - * state. This is normally done by prep_new_huge_page() but - * that takes hugetlb_lock which is already held so we need to - * open code it here. * Reference count trick is needed because allocator gives us * referenced page but the pool requires pages with 0 refcount. */ - __prep_new_huge_page(new_page); __prep_account_new_huge_page(h, nid); page_ref_dec(new_page); enqueue_huge_page(h, new_page); @@ -2420,7 +2418,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, free_new: spin_unlock_irq(&hugetlb_lock); - __free_pages(new_page, huge_page_order(h)); + update_and_free_page(h, new_page); return ret; } -- cgit 1.2.3-korg From b65d4adbc0f0d4619f61ee9d8126bc5005b78802 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:17 -0700 Subject: mm: hugetlb: defer freeing of HugeTLB pages In the subsequent patch, we should allocate the vmemmap pages when freeing a HugeTLB page. But update_and_free_page() can be called under any context, so we cannot use GFP_KERNEL to allocate vmemmap pages. However, we can defer the actual freeing in a kworker to prevent from using GFP_ATOMIC to allocate the vmemmap pages. The __update_and_free_page() is where the call to allocate vmemmap pages will be inserted.
Link: https://lkml.kernel.org/r/20210510030027.56044-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Balbir Singh Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Borislav Petkov Cc: Chen Huang Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++----- mm/hugetlb_vmemmap.c | 12 -------- mm/hugetlb_vmemmap.h | 17 +++++++++++ 3 files changed, 93 insertions(+), 19 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f5493f0f00336..e7eb1ab8c78a27 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1376,7 +1376,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->nr_huge_pages_node[nid]--; } -static void update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_page(struct hstate *h, struct page *page) { int i; struct page *subpage = page; @@ -1399,12 +1399,79 @@ static void update_and_free_page(struct hstate *h, struct page *page) } } +/* + * As update_and_free_page() can be called under any context, so we cannot + * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the + * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate + * the vmemmap pages. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to be + * freed and frees them one-by-one. 
As the page->mapping pointer is going + * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node + * structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + + node = llist_del_all(&hpage_freelist); + + while (node) { + struct page *page; + struct hstate *h; + + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + page->mapping = NULL; + /* + * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() + * is going to trigger because a previous call to + * remove_hugetlb_page() will set_compound_page_dtor(page, + * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + */ + h = size_to_hstate(page_size(page)); + + __update_and_free_page(h, page); + + cond_resched(); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +static inline void flush_free_hpage_work(struct hstate *h) +{ + if (free_vmemmap_pages_per_hpage(h)) + flush_work(&free_hpage_work); +} + +static void update_and_free_page(struct hstate *h, struct page *page, + bool atomic) +{ + if (!free_vmemmap_pages_per_hpage(h) || !atomic) { + __update_and_free_page(h, page); + return; + } + + /* + * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. + * + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the workfn + * hasn't retrieved the list yet. 
+ */ + if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + schedule_work(&free_hpage_work); +} + static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page); + update_and_free_page(h, page, false); cond_resched(); } } @@ -1471,12 +1538,12 @@ void free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page); + update_and_free_page(h, page, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page); + update_and_free_page(h, page, true); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); @@ -1795,7 +1862,7 @@ int dissolve_free_huge_page(struct page *page) remove_hugetlb_page(h, head, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, head); + update_and_free_page(h, head, false); return 0; } out: @@ -2411,14 +2478,14 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * Pages have been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page); + update_and_free_page(h, old_page, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, new_page); + update_and_free_page(h, new_page, false); return ret; } @@ -2832,6 +2899,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * pages in hstate via the proc/sysfs interfaces. 
*/ mutex_lock(&h->resize_lock); + flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* @@ -2941,6 +3009,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); + flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e45a138a7f8558..cb28c5b6c9ff77 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -180,18 +180,6 @@ #define RESERVE_VMEMMAP_NR 2U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -/* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. - * - * Todo: Returns zero for now, which means the feature is disabled. We will - * enable it once all the infrastructure is there. - */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) -{ - return 0; -} - static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) { return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 6923f03534d50b..01f8637adbe0c7 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -12,9 +12,26 @@ #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP void free_huge_page_vmemmap(struct hstate *h, struct page *head); + +/* + * How many vmemmap pages associated with a HugeTLB page that can be freed + * to the buddy allocator. + * + * Todo: Returns zero for now, which means the feature is disabled. We will + * enable it once all the infrastructure is there. 
+ */ +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +{ + return 0; +} #else static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) { } + +static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +{ + return 0; +} #endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ -- cgit 1.2.3-korg From ad2fa3717b74994a22519dbe045757135db00dbb Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:21 -0700 Subject: mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page When we free a HugeTLB page to the buddy allocator, we need to allocate the vmemmap pages associated with it. However, we may not be able to allocate the vmemmap pages when the system is under memory pressure. In this case, we just refuse to free the HugeTLB page. This changes behavior in some corner cases as listed below: 1) Failing to free a huge page triggered by the user (decrease nr_pages). User needs to try again later. 2) Failing to free a surplus huge page when freed by the application. Try again later when freeing a huge page next time. 3) Failing to dissolve a free huge page on ZONE_MOVABLE via offline_pages(). This can happen when we have plenty of ZONE_MOVABLE memory, but not enough kernel memory to allocate vmemmmap pages. We may even be able to migrate huge page contents, but will not be able to dissolve the source huge page. This will prevent an offline operation and is unfortunate as memory offlining is expected to succeed on movable zones. Users that depend on memory hotplug to succeed for movable zones should carefully consider whether the memory savings gained from this feature are worth the risk of possibly not being able to offline memory in certain situations. 4) Failing to dissolve a huge page on CMA/ZONE_MOVABLE via alloc_contig_range() - once we have that handling in place. Mainly affects CMA and virtio-mem. Similar to 3). 
virtio-mem will handle migration errors gracefully. CMA might be able to fall back on other free areas within the CMA region. Vmemmap pages are allocated from the page freeing context. In order for those allocations to be not disruptive (e.g. trigger oom killer) __GFP_NORETRY is used. hugetlb_lock is dropped for the allocation because a non sleeping allocation would be too fragile and it could fail too easily under memory pressure. GFP_ATOMIC or other modes to access memory reserves are not used because we want to prevent consuming reserves under heavy hugetlb freeing. [mike.kravetz@oracle.com: fix dissolve_free_huge_page use of tail/head page] Link: https://lkml.kernel.org/r/20210527231225.226987-1-mike.kravetz@oracle.com [willy@infradead.org: fix alloc_vmemmap_page_list documentation warning] Link: https://lkml.kernel.org/r/20210615200242.1716568-6-willy@infradead.org Link: https://lkml.kernel.org/r/20210510030027.56044-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Mike Kravetz Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Balbir Singh Cc: Barry Song Cc: Bodeddula Balasubramaniam Cc: Borislav Petkov Cc: Chen Huang Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. 
McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/hugetlbpage.rst | 8 ++ Documentation/admin-guide/mm/memory-hotplug.rst | 13 ++++ include/linux/hugetlb.h | 3 + include/linux/mm.h | 2 + mm/hugetlb.c | 98 +++++++++++++++++++++---- mm/hugetlb_vmemmap.c | 34 +++++++++ mm/hugetlb_vmemmap.h | 6 ++ mm/migrate.c | 5 +- mm/sparse-vmemmap.c | 75 ++++++++++++++++++- 9 files changed, 227 insertions(+), 17 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index f7b1c746299119..6988895d09a87a 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -60,6 +60,10 @@ HugePages_Surp the pool above the value in ``/proc/sys/vm/nr_hugepages``. The maximum number of surplus huge pages is controlled by ``/proc/sys/vm/nr_overcommit_hugepages``. + Note: When the feature of freeing unused vmemmap pages associated + with each hugetlb page is enabled, the number of surplus huge pages + may be temporarily larger than the maximum number of surplus huge + pages when the system is under memory pressure. Hugepagesize is the default hugepage size (in Kb). Hugetlb @@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task. A user with root privileges can dynamically allocate more or free some persistent huge pages by increasing or decreasing the value of ``nr_hugepages``. +Note: When the feature of freeing unused vmemmap pages associated with each +hugetlb page is enabled, we can fail to free the huge pages triggered by +the user when ths system is under memory pressure. Please try again later. + Pages that are used as huge pages are reserved inside the kernel and cannot be used for other purposes. Huge pages cannot be swapped out under memory pressure. 
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 05d51d2d8beb73..c6bae2d7716096 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following. Unfortunately, there is no information to show which memory block belongs to ZONE_MOVABLE. This is TBD. + Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE + and the feature of freeing unused vmemmap pages associated with each hugetlb + page is enabled. + + This can happen when we have plenty of ZONE_MOVABLE memory, but not enough + kernel memory to allocate vmemmmap pages. We may even be able to migrate + huge page contents, but will not be able to dissolve the source huge page. + This will prevent an offline operation and is unfortunate as memory offlining + is expected to succeed on movable zones. Users that depend on memory hotplug + to succeed for movable zones should carefully consider whether the memory + savings gained from this feature are worth the risk of possibly not being + able to offline memory in certain situations. + .. note:: Techniques that rely on long-term pinnings of memory (especially, RDMA and vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c8c96481259f8..3578d9d708feea 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -532,12 +532,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. + * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. 
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, + HPG_vmemmap_optimized, __NR_HPAGEFLAGS, }; @@ -583,6 +585,7 @@ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) +HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/linux/mm.h b/include/linux/mm.h index 3437aa7c6c9105..706bee98d9657f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3078,6 +3078,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) void vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); +int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask); void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e7eb1ab8c78a27..778db5de6232e4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1376,6 +1376,39 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->nr_huge_pages_node[nid]--; } +static void add_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int zeroed; + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + + lockdep_assert_held(&hugetlb_lock); + + INIT_LIST_HEAD(&page->lru); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + + if (adjust_surplus) { + h->surplus_huge_pages++; + h->surplus_huge_pages_node[nid]++; + } + + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_private(page, 0); + SetHPageVmemmapOptimized(page); + + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the last reference. 
+ */ + zeroed = put_page_testzero(page); + VM_BUG_ON_PAGE(!zeroed, page); + arch_clear_hugepage_flags(page); + enqueue_huge_page(h, page); +} + static void __update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1384,6 +1417,18 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; + if (alloc_huge_page_vmemmap(h, page)) { + spin_lock_irq(&hugetlb_lock); + /* + * If we cannot allocate vmemmap pages, just refuse to free the + * page and put the page back on the hugetlb free list and treat + * as a surplus page. + */ + add_hugetlb_page(h, page, true); + spin_unlock_irq(&hugetlb_lock); + return; + } + for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1450,7 +1495,7 @@ static inline void flush_free_hpage_work(struct hstate *h) static void update_and_free_page(struct hstate *h, struct page *page, bool atomic) { - if (!free_vmemmap_pages_per_hpage(h) || !atomic) { + if (!HPageVmemmapOptimized(page) || !atomic) { __update_and_free_page(h, page); return; } @@ -1806,10 +1851,14 @@ static struct page *remove_pool_huge_page(struct hstate *h, * nothing for in-use hugepages and non-hugepages. * This function returns values like below: * - * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use - * (allocated or reserved.) - * 0: successfully dissolved free hugepages or the page is not a - * hugepage (considered as already dissolved) + * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages + * when the system is under memory pressure and the feature of + * freeing unused vmemmap pages associated with each hugetlb page + * is enabled. + * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use + * (allocated or reserved.) 
+ * 0: successfully dissolved free hugepages or the page is not a + * hugepage (considered as already dissolved) */ int dissolve_free_huge_page(struct page *page) { @@ -1851,19 +1900,38 @@ int dissolve_free_huge_page(struct page *page) goto retry; } - /* - * Move PageHWPoison flag from head page to the raw error page, - * which makes any subpages rather than the error page reusable. - */ - if (PageHWPoison(head) && page != head) { - SetPageHWPoison(page); - ClearPageHWPoison(head); - } remove_hugetlb_page(h, head, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, head, false); - return 0; + + /* + * Normally update_and_free_page will allocate required vmemmmap + * before freeing the page. update_and_free_page will fail to + * free the page if it can not allocate required vmemmap. We + * need to adjust max_huge_pages if the page is not freed. + * Attempt to allocate vmemmmap here so that we can take + * appropriate action on failure. + */ + rc = alloc_huge_page_vmemmap(h, head); + if (!rc) { + /* + * Move PageHWPoison flag from head page to the raw + * error page, which makes any subpages rather than + * the error page reusable. + */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } + update_and_free_page(h, head, false); + } else { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_page(h, head, false); + h->max_huge_pages++; + spin_unlock_irq(&hugetlb_lock); + } + + return rc; } out: spin_unlock_irq(&hugetlb_lock); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index cb28c5b6c9ff77..a897c77782469f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -185,6 +185,38 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; } +/* + * Previously discarded vmemmap pages will be allocated and remapping + * after this function returns zero. 
+ */ +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +{ + int ret; + unsigned long vmemmap_addr = (unsigned long)head; + unsigned long vmemmap_end, vmemmap_reuse; + + if (!HPageVmemmapOptimized(head)) + return 0; + + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* + * The pages which the vmemmap virtual address range [@vmemmap_addr, + * @vmemmap_end) are mapped to are freed to the buddy allocator, and + * the range is mapped to the page which @vmemmap_reuse is mapped to. + * When a HugeTLB page is freed to the buddy allocator, previously + * discarded vmemmap pages must be allocated and remapping. + */ + ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); + + if (!ret) + ClearHPageVmemmapOptimized(head); + + return ret; +} + void free_huge_page_vmemmap(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; @@ -203,4 +235,6 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head) * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. 
*/ vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse); + + SetHPageVmemmapOptimized(head); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 01f8637adbe0c7..a37771b0b82ac9 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -11,6 +11,7 @@ #include #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); void free_huge_page_vmemmap(struct hstate *h, struct page *head); /* @@ -25,6 +26,11 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) return 0; } #else +static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +{ + return 0; +} + static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) { } diff --git a/mm/migrate.c b/mm/migrate.c index 380ca57b903147..cc4d6af4168349 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -626,7 +626,10 @@ void migrate_page_states(struct page *newpage, struct page *page) if (PageSwapCache(page)) ClearPageSwapCache(page); ClearPagePrivate(page); - set_page_private(page, 0); + + /* page->private contains hugetlb specific flags */ + if (!PageHuge(page)) + set_page_private(page, 0); /* * If any waiters have accumulated on the new page then diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 3ec5488c815cd7..a3aa275e26686e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,8 @@ * @remap_pte: called for each lowest-level entry (PTE). * @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_addr: the virtual address of the @reuse_page page. - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. 
*/ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, @@ -224,6 +225,78 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, free_vmemmap_page_list(&vmemmap_pages); } +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + */ +int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). 
*/ + BUG_ON(start - reuse != PAGE_SIZE); + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + vmemmap_remap_range(reuse, end, &walk); + + return 0; +} + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. -- cgit 1.2.3-korg From 774905878fc9b0b9a5ee4a889b97f773a077aeee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:33 -0700 Subject: mm: hugetlb: introduce nr_free_vmemmap_pages in the struct hstate All the infrastructure is ready, so we introduce nr_free_vmemmap_pages field in the hstate to indicate how many vmemmap pages associated with a HugeTLB page that can be freed to buddy allocator. And initialize it in the hugetlb_vmemmap_init(). This patch is actual enablement of the feature. There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. Link: https://lkml.kernel.org/r/20210510030027.56044-10-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Balbir Singh Cc: Barry Song Cc: Borislav Petkov Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. 
McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 1 + mm/hugetlb_vmemmap.c | 33 +++++++++++++++++++++++++++++++++ mm/hugetlb_vmemmap.h | 10 ++++++---- 4 files changed, 43 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9ad99848f9f0ac..8c192084423687 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -608,6 +608,9 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + unsigned int nr_free_vmemmap_pages; +#endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ struct cftype cgroup_files_dfl[7]; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 778db5de6232e4..dd9c90c082fc41 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3585,6 +3585,7 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); + hugetlb_vmemmap_init(h); parsed_hstate = h; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 3070e1465b1be9..f9f9bb21231983 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -262,3 +262,36 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head) SetHPageVmemmapOptimized(head); } + +void __init hugetlb_vmemmap_init(struct hstate *h) +{ + unsigned int nr_pages = pages_per_huge_page(h); + unsigned int vmemmap_pages; + + /* + * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct + * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. 
+ */ + BUILD_BUG_ON(__NR_USED_SUBPAGE >= + RESERVE_VMEMMAP_SIZE / sizeof(struct page)); + + if (!hugetlb_free_vmemmap_enabled) + return; + + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; + /* + * The head page and the first tail page are not to be freed to buddy + * allocator, the other pages will map to the first tail page, so they + * can be freed. + * + * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true + * on some architectures (e.g. aarch64). See Documentation/arm64/ + * hugetlbpage.rst for more details. + */ + if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) + h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + + pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, + h->name); +} diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index a37771b0b82ac9..cb2bef8f9e736a 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -13,17 +13,15 @@ #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); void free_huge_page_vmemmap(struct hstate *h, struct page *head); +void hugetlb_vmemmap_init(struct hstate *h); /* * How many vmemmap pages associated with a HugeTLB page that can be freed * to the buddy allocator. - * - * Todo: Returns zero for now, which means the feature is disabled. We will - * enable it once all the infrastructure is there. 
*/ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) { - return 0; + return h->nr_free_vmemmap_pages; } #else static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) @@ -35,6 +33,10 @@ static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) { } +static inline void hugetlb_vmemmap_init(struct hstate *h) +{ +} + static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) { return 0; -- cgit 1.2.3-korg From 79c1c594f49a88fba9744cb5c85978c6b1b365ec Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 30 Jun 2021 18:48:00 -0700 Subject: mm/hugetlb: change parameters of arch_make_huge_pte() Patch series "Subject: [PATCH v2 0/5] Implement huge VMAP and VMALLOC on powerpc 8xx", v2. This series implements huge VMAP and VMALLOC on powerpc 8xx. Powerpc 8xx has 4 page sizes: - 4k - 16k - 512k - 8M At the time being, vmalloc and vmap only support huge pages which are leaf at PMD level. Here the PMD level is 4M, it doesn't correspond to any supported page size. For now, implement use of 16k and 512k pages which is done at PTE level. Support of 8M pages will be implemented later, it requires use of hugepd tables. To allow this, the architecture provides two functions: - arch_vmap_pte_range_map_size() which tells vmap_pte_range() what page size to use. A stub returning PAGE_SIZE is provided when the architecture doesn't provide this function. - arch_vmap_pte_supported_shift() which tells __vmalloc_node_range() what page shift to use for a given area size. A stub returning PAGE_SHIFT is provided when the architecture doesn't provide this function. This patch (of 5): At the time being, arch_make_huge_pte() has the following prototype: pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); vma is used to get the pages shift or size. vma is also used on Sparc to get vm_flags. page is not used. writable is not used. 
In order to use this function without a vma, replace vma by shift and flags. Also remove the used parameters. Link: https://lkml.kernel.org/r/cover.1620795204.git.christophe.leroy@csgroup.eu Link: https://lkml.kernel.org/r/f4633ac6a7da2f22f31a04a89e0a7026bb78b15b.1620795204.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Acked-by: Mike Kravetz Cc: Nicholas Piggin Cc: Mike Kravetz Cc: Mike Rapoport Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/hugetlb.h | 3 +-- arch/arm64/mm/hugetlbpage.c | 5 ++--- arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 5 ++--- arch/sparc/include/asm/pgtable_64.h | 3 +-- arch/sparc/mm/hugetlbpage.c | 6 ++---- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 6 ++++-- mm/migrate.c | 4 +++- 8 files changed, 17 insertions(+), 19 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 5abf91e3494c78..1242f71937f80e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -23,8 +23,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) } #define arch_clear_hugepage_flags arch_clear_hugepage_flags -extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writable); +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 58987a98e17986..23505fc3532470 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -339,10 +339,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; } -pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int 
writable) +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - size_t pagesize = huge_page_size(hstate_vma(vma)); + size_t pagesize = 1UL << shift; if (pagesize == CONT_PTE_SIZE) { entry = pte_mkcont(entry); diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 39be9aea86db6f..64b6c608eca433 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -66,10 +66,9 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, } #ifdef CONFIG_PPC_4K_PAGES -static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writable) +static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - size_t size = huge_page_size(hstate_vma(vma)); + size_t size = 1UL << shift; if (size == SZ_16K) return __pte(pte_val(entry) & ~_PAGE_HUGE); diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 550d3904de65b7..2cd80a0a97953f 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -377,8 +377,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) #define pgprot_noncached pgprot_noncached #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) -extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writable); +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte static inline unsigned long __pte_default_huge_mask(void) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 04d8790f6c3259..0f49fada20938c 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -177,10 +177,8 @@ static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift) return sun4u_hugepage_shift_to_tte(entry, shift); } -pte_t 
arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writeable) +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - unsigned int shift = huge_page_shift(hstate_vma(vma)); pte_t pte; pte = hugepage_shift_to_tte(entry, shift); @@ -188,7 +186,7 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, #ifdef CONFIG_SPARC64 /* If this vma has ADI enabled on it, turn on TTE.mcd */ - if (vma->vm_flags & VM_SPARC_ADI) + if (flags & VM_SPARC_ADI) return pte_mkmcd(pte); else return pte_mknotmcd(pte); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8c192084423687..cfde3bec22610d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -741,8 +741,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } #endif #ifndef arch_make_huge_pte -static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, - struct page *page, int writable) +static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, + vm_flags_t flags) { return entry; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd9c90c082fc41..88f2178ad7c992 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4060,6 +4060,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable) { pte_t entry; + unsigned int shift = huge_page_shift(hstate_vma(vma)); if (writable) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, @@ -4070,7 +4071,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); - entry = arch_make_huge_pte(entry, vma, page, writable); + entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } @@ -5468,10 +5469,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } if (!huge_pte_none(pte)) { pte_t old_pte; + unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, 
ptep); pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); - pte = arch_make_huge_pte(pte, vma, NULL, 0); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } diff --git a/mm/migrate.c b/mm/migrate.c index cc4d6af4168349..75a15f0a269876 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -226,8 +226,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { + unsigned int shift = huge_page_shift(hstate_vma(vma)); + pte = pte_mkhuge(pte); - pte = arch_make_huge_pte(pte, vma, new, 0); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); -- cgit 1.2.3-korg From 8cc5fcbb5be814c115085549b700e473685b11e9 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 30 Jun 2021 18:48:19 -0700 Subject: mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY On UFFDIO_COPY, if we fail to copy the page contents while holding the hugetlb_fault_mutex, we will drop the mutex and return to the caller after allocating a page that consumed a reservation. In this case there may be a fault that double consumes the reservation. To handle this, we free the allocated page, fix the reservations, and allocate a temporary hugetlb page and return that to the caller. When the caller does the copy outside of the lock, we again check the cache, and allocate a page consuming the reservation, and copy over the contents. 
Test: Hacked the code locally such that resv_huge_pages underflows produce a warning and the copy_huge_page_from_user() always fails, then: ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success ./tools/testing/selftests/vm/userfaultfd hugetlb 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success Both tests succeed and produce no warnings. After the test runs number of free/resv hugepages is correct. [yuehaibing@huawei.com: remove set but not used variable 'vm_alloc_shared'] Link: https://lkml.kernel.org/r/20210601141610.28332-1-yuehaibing@huawei.com [almasrymina@google.com: fix allocation error check and copy func name] Link: https://lkml.kernel.org/r/20210605010626.1459873-1-almasrymina@google.com Link: https://lkml.kernel.org/r/20210528005029.88088-1-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: YueHaibing Cc: Axel Rasmussen Cc: Peter Xu Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 4 ++++ mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++--------- mm/migrate.c | 2 +- mm/userfaultfd.c | 50 +------------------------------------------------ 4 files changed, 45 insertions(+), 59 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4bb4e519e3f56d..7b7b7397727851 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +extern void copy_huge_page(struct page *dst, struct page *src); #else static inline void putback_movable_pages(struct list_head *l) {} @@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, return -ENOSYS; } +static inline 
void copy_huge_page(struct page *dst, struct page *src) +{ +} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88f2178ad7c992..b14f4d1749b215 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); - struct address_space *mapping; - pgoff_t idx; + struct hstate *h = hstate_vma(dst_vma); + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size; int vm_shared = dst_vma->vm_flags & VM_SHARED; - struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; - int ret; + int ret = -ENOMEM; struct page *page; int writable; - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - if (is_continue) { ret = -EFAULT; page = find_lock_page(mapping, idx); @@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; + /* Free the allocated page which may have + * consumed a reservation. + */ + restore_reserve_on_error(h, dst_vma, dst_addr, page); + put_page(page); + + /* Allocate a temporary page to hold the copied + * contents. + */ + page = alloc_huge_page_vma(h, dst_vma, dst_addr); + if (!page) { + ret = -ENOMEM; + goto out; + } *pagep = page; - /* don't free the page */ + /* Set the outparam pagep and return to the caller to + * copy the contents outside the lock. Don't free the + * page. 
+ */ goto out; } } else { - page = *pagep; + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + put_page(*pagep); + ret = -EEXIST; + *pagep = NULL; + goto out; + } + + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) { + ret = -ENOMEM; + *pagep = NULL; + goto out; + } + copy_huge_page(page, *pagep); + put_page(*pagep); *pagep = NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 75a15f0a269876..8fc766e52e527e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src, } } -static void copy_huge_page(struct page *dst, struct page *src) +void copy_huge_page(struct page *dst, struct page *src) { int i; int nr_pages; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 63a73e164d5510..da5535d2d99033 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long len, enum mcopy_atomic_mode mode) { - int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; @@ -308,7 +307,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); - vm_alloc_shared = vm_shared; cond_resched(); @@ -346,54 +344,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, out_unlock: mmap_read_unlock(dst_mm); out: - if (page) { - /* - * We encountered an error and are about to free a newly - * allocated huge page. - * - * Reservation handling is very subtle, and is different for - * private and shared mappings. See the routine - * restore_reserve_on_error for details. Unfortunately, we - * can not call restore_reserve_on_error now as it would - * require holding mmap_lock. 
- * - * If a reservation for the page existed in the reservation - * map of a private mapping, the map was modified to indicate - * the reservation was consumed when the page was allocated. - * We clear the HPageRestoreReserve flag now so that the global - * reserve count will not be incremented in free_huge_page. - * The reservation map will still indicate the reservation - * was consumed and possibly prevent later page allocation. - * This is better than leaking a global reservation. If no - * reservation existed, it is still safe to clear - * HPageRestoreReserve as no adjustments to reservation counts - * were made during allocation. - * - * The reservation map for shared mappings indicates which - * pages have reservations. When a huge page is allocated - * for an address with a reservation, no change is made to - * the reserve map. In this case HPageRestoreReserve will be - * set to indicate that the global reservation count should be - * incremented when the page is freed. This is the desired - * behavior. However, when a huge page is allocated for an - * address without a reservation a reservation entry is added - * to the reservation map, and HPageRestoreReserve will not be - * set. When the page is freed, the global reserve count will - * NOT be incremented and it will appear as though we have - * leaked reserved page. In this case, set HPageRestoreReserve - * so that the global reserve count will be incremented to - * match the reservation map entry which was created. - * - * Note that vm_alloc_shared is based on the flags of the vma - * for which the page was originally allocated. dst_vma could - * be different or NULL on error. 
- */ - if (vm_alloc_shared) - SetHPageRestoreReserve(page); - else - ClearHPageRestoreReserve(page); + if (page) put_page(page); - } BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); -- cgit 1.2.3-korg From 48b8d744ea841b8adf8d07bfe7a2d55f22e4d179 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 30 Jun 2021 18:48:31 -0700 Subject: hugetlb: remove prep_compound_huge_page cleanup Patch series "Fix prep_compound_gigantic_page ref count adjustment". These patches address the possible race between prep_compound_gigantic_page and __page_cache_add_speculative as described by Jann Horn in [1]. The first patch simply removes the unnecessary/obsolete helper routine prep_compound_huge_page to make the actual fix a little simpler. The second patch is the actual fix and has a detailed explanation in the commit message. This potential issue has existed for almost 10 years and I am unaware of anyone actually hitting the race. I did not cc stable, but would be happy to squash the patches and send to stable if anyone thinks that is a good idea. [1] https://lore.kernel.org/linux-mm/CAG48ez23q0Jy9cuVnwAe7t_fdhMk2S7N5Hdi-GLcCeq5bsfLxw@mail.gmail.com/ This patch (of 2): I could not think of a reliable way to recreate the issue for testing. Rather, I 'simulated errors' to exercise all the error paths. The routine prep_compound_huge_page is a simple wrapper to call either prep_compound_gigantic_page or prep_compound_page. However, it is only called from gather_bootmem_prealloc which only processes gigantic pages. Eliminate the routine and call prep_compound_gigantic_page directly. Link: https://lkml.kernel.org/r/20210622021423.154662-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20210622021423.154662-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Jan Kara Cc: Jann Horn Cc: John Hubbard Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Youquan Song Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b14f4d1749b215..8048763e98a7ca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1320,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned int order); #else /* !CONFIG_CONTIG_ALLOC */ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) @@ -2759,16 +2757,10 @@ int __alloc_bootmem_huge_page(struct hstate *h) return 1; } -static void __init prep_compound_huge_page(struct page *page, - unsigned int order) -{ - if (unlikely(order > (MAX_ORDER - 1))) - prep_compound_gigantic_page(page, order); - else - prep_compound_page(page, order); -} - -/* Put bootmem huge pages into the standard lists after mem_map is up */ +/* + * Put bootmem huge pages into the standard lists after mem_map is up. + * Note: This only applies to gigantic (order > MAX_ORDER) pages. 
+ */ static void __init gather_bootmem_prealloc(void) { struct huge_bootmem_page *m; @@ -2777,20 +2769,19 @@ static void __init gather_bootmem_prealloc(void) struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(page_count(page) != 1); - prep_compound_huge_page(page, huge_page_order(h)); + prep_compound_gigantic_page(page, huge_page_order(h)); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ /* - * If we had gigantic hugepages allocated at boot time, we need - * to restore the 'stolen' pages to totalram_pages in order to - * fix confusing memory reports from free(1) and another - * side-effects, like CommitLimit going negative. + * We need to restore the 'stolen' pages to totalram_pages + * in order to fix confusing memory reports from free(1) and + * other side-effects, like CommitLimit going negative. */ - if (hstate_is_gigantic(h)) - adjust_managed_page_count(page, pages_per_huge_page(h)); + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } } -- cgit 1.2.3-korg From 7118fc2906e2925d7edb5ed9c8a57f2a5f23b849 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 30 Jun 2021 18:48:34 -0700 Subject: hugetlb: address ref count racing in prep_compound_gigantic_page In [1], Jann Horn points out a possible race between prep_compound_gigantic_page and __page_cache_add_speculative. The root cause of the possible race is prep_compound_gigantic_page uncondittionally setting the ref count of pages to zero. It does this because prep_compound_gigantic_page is handed a 'group' of pages from an allocator and needs to convert that group of pages to a compound page. The ref count of each page in this 'group' is one as set by the allocator. However, the ref count of compound page tail pages must be zero. The potential race comes about when ref counted pages are returned from the allocator. 
When this happens, other mm code could also take a reference on the page. __page_cache_add_speculative is one such example. Therefore, prep_compound_gigantic_page can not just set the ref count of pages to zero as it does today. Doing so would lose the reference taken by any other code. This would lead to BUGs in code checking ref counts and could possibly even lead to memory corruption. There are two possible ways to address this issue. 1) Make all allocators of gigantic groups of pages be able to return a properly constructed compound page. 2) Make prep_compound_gigantic_page be more careful when constructing a compound page. This patch takes approach 2. In prep_compound_gigantic_page, use cmpxchg to only set ref count to zero if it is one. If the cmpxchg fails, call synchronize_rcu() in the hope that the extra ref count will be dropped during an RCU grace period. This is not a performance critical code path and the wait should be acceptable. If the ref count is still inflated after the grace period, then undo any modifications made and return an error. Currently prep_compound_gigantic_page is type void and does not return errors. Modify the two callers to check for and handle error returns. On error, the caller must free the 'group' of pages as they can not be used to form a gigantic page. After freeing pages, the runtime caller (alloc_fresh_huge_page) will retry the allocation once. Boot time allocations can not be retried. The routine prep_compound_page also unconditionally sets the ref count of compound page tail pages to zero. However, in this case the buddy allocator is constructing a compound page from freshly allocated pages. The ref count on those freshly allocated pages is already zero, so the set_page_count(p, 0) is unnecessary and could lead to confusion. Just remove it.
[1] https://lore.kernel.org/linux-mm/CAG48ez23q0Jy9cuVnwAe7t_fdhMk2S7N5Hdi-GLcCeq5bsfLxw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210622021423.154662-3-mike.kravetz@oracle.com Fixes: 58a84aa92723 ("thp: set compound tail page _count to zero") Signed-off-by: Mike Kravetz Reported-by: Jann Horn Cc: Youquan Song Cc: Andrea Arcangeli Cc: Jan Kara Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++------- mm/page_alloc.c | 1 - 2 files changed, 64 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8048763e98a7ca..89ba5147206e6d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1623,9 +1623,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_unlock_irq(&hugetlb_lock); } -static void prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool prep_compound_gigantic_page(struct page *page, unsigned int order) { - int i; + int i, j; int nr_pages = 1 << order; struct page *p = page + 1; @@ -1647,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * after get_user_pages(). */ __ClearPageReserved(p); + /* + * Subtle and very unlikely + * + * Gigantic 'page allocators' such as memblock or cma will + * return a set of pages with each page ref counted. We need + * to turn this set of pages into a compound page with tail + * page ref counts set to zero. Code such as speculative page + * cache adding could take a ref on a 'to be' tail page. + * We need to respect any increased ref count, and only set + * the ref count to zero if count is currently 1. If count + * is not 1, we call synchronize_rcu in the hope that a rcu + * grace period will cause ref count to drop and then retry. 
+ * If count is still inflated on retry we return an error and + * must discard the pages. + */ + if (!page_ref_freeze(p, 1)) { + pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); + synchronize_rcu(); + if (!page_ref_freeze(p, 1)) + goto out_error; + } set_page_count(p, 0); set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); atomic_set(compound_pincount_ptr(page), 0); + return true; + +out_error: + /* undo tail page modifications made above */ + p = page + 1; + for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { + clear_compound_head(p); + set_page_refcounted(p); + } + /* need to clear PG_reserved on remaining tail pages */ + for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + __ClearPageReserved(p); + set_compound_order(page, 0); + page[1].compound_nr = 0; + __ClearPageHead(page); + return false; } /* @@ -1771,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, nodemask_t *node_alloc_noretry) { struct page *page; + bool retry = false; +retry: if (hstate_is_gigantic(h)) page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else @@ -1780,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, if (!page) return NULL; - if (hstate_is_gigantic(h)) - prep_compound_gigantic_page(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + /* + * Rare failure to convert pages to compound page. + * Free pages and try again - ONCE! 
+ */ + free_gigantic_page(page, huge_page_order(h)); + if (!retry) { + retry = true; + goto retry; + } + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + return NULL; + } + } prep_new_huge_page(h, page, page_to_nid(page)); return page; @@ -2771,10 +2823,14 @@ static void __init gather_bootmem_prealloc(void) VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(page_count(page) != 1); - prep_compound_gigantic_page(page, huge_page_order(h)); - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* free it into the hugepage allocator */ + if (prep_compound_gigantic_page(page, huge_page_order(h))) { + WARN_ON(PageReserved(page)); + prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* add to the hugepage allocator */ + } else { + free_gigantic_page(page, huge_page_order(h)); + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + } /* * We need to restore the 'stolen' pages to totalram_pages diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db00ee8d79d210..eeff6484371824 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -754,7 +754,6 @@ void prep_compound_page(struct page *page, unsigned int order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - set_page_count(p, 0); p->mapping = TAIL_MAPPING; set_compound_head(p, page); } -- cgit 1.2.3-korg From 4dd845b5a3e57ad07f26ef808707b064696fe34b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Jun 2021 18:54:09 -0700 Subject: mm/swapops: rework swap entry manipulation code Both migration and device private pages use special swap entries that are manipluated by a range of inline functions. The arguments to these are somewhat inconsistent so rework them to remove flag type arguments and to make the arguments similar for both read and write entry creation. 
Link: https://lkml.kernel.org/r/20210616105937.23201-3-apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Ralph Campbell Cc: Ben Skeggs Cc: Hugh Dickins Cc: John Hubbard Cc: "Matthew Wilcox (Oracle)" Cc: Peter Xu Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 56 ++++++++++++++++++++++++++----------------------- mm/debug_vm_pgtable.c | 12 +++++------ mm/hmm.c | 2 +- mm/huge_memory.c | 26 ++++++++++++++++------- mm/hugetlb.c | 10 +++++---- mm/memory.c | 10 +++++---- mm/migrate.c | 26 +++++++++++++++++------ mm/mprotect.c | 10 +++++---- mm/rmap.c | 10 ++++++--- 9 files changed, 100 insertions(+), 62 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c24c79812bc1dd..04d76357aa0c1a 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -107,35 +107,35 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { - return swp_entry(write ? 
SWP_DEVICE_WRITE : SWP_DEVICE_READ, - page_to_pfn(page)); + return swp_entry(SWP_DEVICE_READ, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) +static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; + return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline void make_device_private_entry_read(swp_entry_t *entry) +static inline bool is_device_private_entry(swp_entry_t entry) { - *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry)); + int type = swp_type(entry); + return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; } -static inline bool is_write_device_private_entry(swp_entry_t entry) +static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } #else /* CONFIG_DEVICE_PRIVATE */ -static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline void make_device_private_entry_read(swp_entry_t *entry) +static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { + return swp_entry(0, 0); } static inline bool is_device_private_entry(swp_entry_t entry) @@ -143,35 +143,32 @@ static inline bool is_device_private_entry(swp_entry_t entry) return false; } -static inline bool is_write_device_private_entry(swp_entry_t entry) +static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline swp_entry_t make_migration_entry(struct page *page, int write) -{ - BUG_ON(!PageLocked(compound_head(page))); - - return swp_entry(write ? 
SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, - page_to_pfn(page)); -} - static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || swp_type(entry) == SWP_MIGRATION_WRITE); } -static inline int is_write_migration_entry(swp_entry_t entry) +static inline int is_writable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } -static inline void make_migration_entry_read(swp_entry_t *entry) +static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { - *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); + return swp_entry(SWP_MIGRATION_READ, offset); +} + +static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_WRITE, offset); } extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, @@ -181,21 +178,28 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, extern void migration_entry_wait_huge(struct vm_area_struct *vma, struct mm_struct *mm, pte_t *pte); #else +static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + +static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} -#define make_migration_entry(page, write) swp_entry(0, 0) static inline int is_migration_entry(swp_entry_t swp) { return 0; } -static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, struct mm_struct *mm, pte_t *pte) { } -static inline int is_write_migration_entry(swp_entry_t entry) +static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 
f7b23565a04f09..1c922691aa616e 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -843,17 +843,17 @@ static void __init swap_migration_tests(void) * locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); - swp = make_migration_entry(page, 1); + swp = make_writable_migration_entry(page_to_pfn(page)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(!is_write_migration_entry(swp)); + WARN_ON(!is_writable_migration_entry(swp)); - make_migration_entry_read(&swp); + swp = make_readable_migration_entry(swp_offset(swp)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_write_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); - swp = make_migration_entry(page, 0); + swp = make_readable_migration_entry(page_to_pfn(page)); WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_write_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); __ClearPageLocked(page); __free_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 3b2dda71d0ed19..11df3ca30b82f0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -255,7 +255,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, */ if (hmm_is_device_private_entry(range, entry)) { cpu_flags = HMM_PFN_VALID; - if (is_write_device_private_entry(entry)) + if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; *hmm_pfn = swp_offset(entry) | cpu_flags; return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 327b8d9d8d2f01..1c81aa11d61880 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1054,8 +1054,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, swp_entry_t entry = pmd_to_swp_entry(pmd); VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (is_write_migration_entry(entry)) { - make_migration_entry_read(&entry); + if (is_writable_migration_entry(entry)) { + entry = make_readable_migration_entry( + swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); @@ -1772,13 
+1773,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pmd_to_swp_entry(*pmd); VM_BUG_ON(!is_pmd_migration_entry(*pmd)); - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write */ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); @@ -2067,7 +2069,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); - write = is_write_migration_entry(entry); + write = is_writable_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); @@ -2099,7 +2101,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (freeze || pmd_migration) { swp_entry_t swp_entry; - swp_entry = make_migration_entry(page + i, write); + if (write) + swp_entry = make_writable_migration_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_migration_entry( + page_to_pfn(page + i)); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); @@ -3171,7 +3178,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdval = pmdp_invalidate(vma, address, pvmw->pmd); if (pmd_dirty(pmdval)) set_page_dirty(page); - entry = make_migration_entry(page, pmd_write(pmdval)); + if (pmd_write(pmdval)) + entry = make_writable_migration_entry(page_to_pfn(page)); + else + entry = make_readable_migration_entry(page_to_pfn(page)); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); @@ -3197,7 +3207,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); 
if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_write_migration_entry(entry)) + if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 89ba5147206e6d..924553aa8f789a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4242,12 +4242,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, is_hugetlb_entry_hwpoisoned(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); - if (is_write_migration_entry(swp_entry) && cow) { + if (is_writable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ - make_migration_entry_read(&swp_entry); + swp_entry = make_readable_migration_entry( + swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); set_huge_swap_pte_at(src, addr, src_pte, entry, sz); @@ -5532,10 +5533,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { pte_t newpte; - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); set_huge_swap_pte_at(mm, address, ptep, newpte, huge_page_size(h)); diff --git a/mm/memory.c b/mm/memory.c index 6723931085c776..e30488e9202f19 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -733,13 +733,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, rss[mm_counter(page)]++; - if (is_write_migration_entry(entry) && + if (is_writable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both * parent and child to be set to read. 
*/ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); @@ -770,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_write_device_private_entry(entry) && + if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { - make_device_private_entry_read(&entry); + entry = make_readable_device_private_entry( + swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); diff --git a/mm/migrate.c b/mm/migrate.c index b4abb87249e166..de47bdfbd2f8a3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -210,13 +210,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, * Recheck VMA as permissions can change since migration started */ entry = pte_to_swp_entry(*pvmw.pte); - if (is_write_migration_entry(entry)) + if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); if (unlikely(is_device_private_page(new))) { - entry = make_device_private_entry(new, pte_write(pte)); + if (pte_write(pte)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_swp_mksoft_dirty(pte); @@ -2297,7 +2302,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - if (is_write_device_private_entry(entry)) + if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) @@ -2343,8 +2348,12 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ptep_get_and_clear(mm, addr, 
ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, mpfn & - MIGRATE_PFN_WRITE); + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); if (pte_present(pte)) { if (pte_soft_dirty(pte)) @@ -2817,7 +2826,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (is_device_private_page(page)) { swp_entry_t swp_entry; - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { /* diff --git a/mm/mprotect.c b/mm/mprotect.c index e7a443157988df..ee5961888e7075 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -143,23 +143,25 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; - if (is_write_migration_entry(entry)) { + if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so * just be safe and disable write */ - make_migration_entry_read(&entry); + entry = make_readable_migration_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_write_device_private_entry(entry)) { + } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. 
*/ - make_device_private_entry_read(&entry); + entry = make_readable_device_private_entry( + swp_offset(entry)); newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); diff --git a/mm/rmap.c b/mm/rmap.c index f9fd5bc54f0a37..b9986c8db5249b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1533,7 +1533,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - entry = make_migration_entry(page, 0); + entry = make_readable_migration_entry(page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); /* @@ -1629,8 +1629,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - entry = make_migration_entry(subpage, - pte_write(pteval)); + if (pte_write(pteval)) + entry = make_writable_migration_entry( + page_to_pfn(subpage)); + else + entry = make_readable_migration_entry( + page_to_pfn(subpage)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); -- cgit 1.2.3-korg