From 9b94b5a2f9a95d693cfa8db6e34dcb3f1cd91204 Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Tue, 18 Jun 2024 09:45:17 +0800 Subject: khugepaged: simplify the allocation of slab caches Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Link: https://lkml.kernel.org/r/20240618014517.25954-1-lihongfu@kylinos.cn Signed-off-by: Hongfu Li Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/khugepaged.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm/khugepaged.c') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 774a97e6e2da39..f8d08b49420cc6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -385,10 +385,7 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { - mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct khugepaged_mm_slot), - __alignof__(struct khugepaged_mm_slot), - 0, NULL); + mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; -- cgit 1.2.3-korg From 15bde4abab734c687c1f81704886aba3a70c268e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:35 +1200 Subject: mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. [akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 25 ++++++++++++++++--------- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- 9 files changed, 28 insertions(+), 21 deletions(-) (limited to 'mm/khugepaged.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 980fa5d75d69c2..0978c64f49d833 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -244,7 +244,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2c83ba776fc7b2..c20368aa33ddd8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 46ba81240d9694..14a05c64380655 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -974,7 +974,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f8d08b49420cc6..409f67a817f101 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1210,7 +1210,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index d8a0b7d2e15bca..a4fc6e632d2c2a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,7 +930,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); - folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; @@ -3402,7 +3402,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); - folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); @@ -4339,7 +4339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, @@ -4594,7 +4594,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); #endif - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) @@ -4792,7 +4792,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 051d0a3ccbee02..6d66dc1c6ffa06 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -658,7 +658,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); if (!folio_is_zone_device(folio)) folio_add_lru_vma(folio, vma); folio_get(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 69cbd7ac2a5c87..c0c99f91ade1b1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1401,30 +1401,35 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * The folio does not have to be locked. + * The folio doesn't necessarily need to be locked while it's exclusive + * unless two threads map it concurrently. However, the folio must be + * locked if it's shared. * - * If the folio is pmd-mappable, it is accounted as a THP. As the folio - * is new, it's assumed to be mapped exclusively by a single process. + * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, rmap_t flags) { - int nr = folio_nr_pages(folio); + const int nr = folio_nr_pages(folio); + const bool exclusive = flags & RMAP_EXCLUSIVE; int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); __folio_set_swapbacked(folio); - __folio_set_anon(folio, vma, address, true); + __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; @@ -1433,7 +1438,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); - SetPageAnonExclusive(page); + if (exclusive) + SetPageAnonExclusive(page); } /* increment count (starts at -1) */ @@ -1445,7 +1451,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c6d8e557c0fd5..ae1d2700f6a3e2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1911,7 +1911,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5e7f2801698ac5..8dedaec0048634 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -216,7 +216,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { - folio_add_new_anon_rmap(folio, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } -- cgit 1.2.3-korg From 00f58104202c472e487f0866fbd38832523fd4f9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 4 Jul 2024 10:10:50 +0100 Subject: mm: fix khugepaged activation policy Since the introduction of mTHP, the docuementation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop. Consequently the following was resulting in khugepaged eroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Jonathan Corbet Cc: Lance Yang Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 11 +++++----- include/linux/huge_mm.h | 12 ----------- mm/huge_memory.c | 7 +++++++ mm/khugepaged.c | 33 +++++++++++++++++++++++------- 4 files changed, 38 insertions(+), 25 deletions(-) (limited to 'mm/khugepaged.c') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index a1bc9b24e29ab3..fe237825b95c62 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -202,12 +202,11 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when one or more hugepage -sizes are enabled (either by directly setting "always" or "madvise", -or by setting "inherit" while the top-level enabled is set to "always" -or "madvise"), and it'll be automatically shutdown when the last -hugepage size is disabled (either by directly setting "never", or by -setting "inherit" while the top-level enabled is set to "never"). +khugepaged will be automatically started when PMD-sized THP is enabled +(either of the per-size anon control or the top-level control are set +to "always" or "madvise"), and it'll be automatically shutdown when +PMD-sized THP is disabled (when both the per-size anon control and the +top-level control are "never") Khugepaged controls ------------------- diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cee3c5da8f0ed4..acb6ac24a07e6f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -128,18 +128,6 @@ static inline bool hugepage_global_always(void) (1< 0) { + int err; + + err = start_stop_khugepaged(); + if (err) + ret = err; + } return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 409f67a817f101..a5ec03ef87226d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -413,6 +413,26 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) test_bit(MMF_DISABLE_THP, &mm->flags); } +static bool hugepage_pmd_enabled(void) +{ + /* + * We cover both the anon and the file-backed case here; file-backed + * hugepages, when configured in, are determined by the global control. + * Anon pmd-sized hugepages are determined by the pmd-size control. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + hugepage_global_enabled()) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_always)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && + hugepage_global_enabled()) + return true; + return false; +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -449,7 +469,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - hugepage_flags_enabled()) { + hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); @@ -2462,8 +2482,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, static int khugepaged_has_work(void) { - return !list_empty(&khugepaged_scan.mm_head) && - hugepage_flags_enabled(); + return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) @@ -2536,7 +2555,7 @@ static void khugepaged_wait_work(void) return; } - if (hugepage_flags_enabled()) + if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } @@ -2567,7 +2586,7 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!hugepage_flags_enabled()) { + if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } @@ -2617,7 +2636,7 @@ int start_stop_khugepaged(void) int err = 0; mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled()) { + if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -2643,7 +2662,7 @@ int start_stop_khugepaged(void) void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled() && khugepaged_thread) + if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } -- cgit 1.2.3-korg