From 6f775463d0027733caebfb75d62ec7c4f807f834 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:09 +0800 Subject: mm: shmem: use folio_alloc_mpol() in shmem_alloc_folio() Let's change shmem_alloc_folio() to take an order and use the folio_alloc_mpol() helper, then use it directly for normal or large folios to clean up the code. Link: https://lkml.kernel.org/r/20240515070709.78529-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index a8b181a6340296..b2dce4103f0e78 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1603,32 +1603,18 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct folio *shmem_alloc_hugefolio(gfp_t gfp, +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; - - mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); - page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); - mpol_cond_put(mpol); - - return page_rmappable_folio(page); -} - -static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - struct mempolicy *mpol; - pgoff_t ilx; - struct page *page; + struct folio *folio; - mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); - page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); - return (struct folio *)page; + return folio; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1660,12 +1646,12 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, index + HPAGE_PMD_NR - 1, XA_PRESENT)) return ERR_PTR(-E2BIG); - folio = shmem_alloc_hugefolio(gfp, info, index); + folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); if (!folio) count_vm_event(THP_FILE_FALLBACK); } else { pages = 1; - folio = shmem_alloc_folio(gfp, info, index); + folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); @@ -1765,7 +1751,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, */ gfp &= ~GFP_CONSTRAINT_MASK; VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, info, index); + new = shmem_alloc_folio(gfp, 0, info, index); if (!new) return -ENOMEM; @@ -2633,7 +2619,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, info, pgoff); + folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; -- cgit 1.2.3-korg From 7aad25b4b47ea5b67e1eb8be0db211b899dce60d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:53 +0800 Subject: mm/swap: reduce swap cache search space Currently we use one swap_address_space for every 64M chunk to reduce lock contention; this is like having a set of smaller swap files inside one swap device. But when doing a swap cache lookup or insert, we are still using the offset within the whole large swap device. This is OK for correctness, as the offset (key) is unique. But the Xarray is specially optimized for small indexes: it creates radix tree levels lazily, just enough to fit the largest key stored in one Xarray. So we are wasting tree nodes unnecessarily.
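To make the key narrowing concrete, here is a minimal standalone C sketch of the arithmetic (illustrative only: the constants mirror SWAP_ADDRESS_SPACE_SHIFT/SWAP_ADDRESS_SPACE_MASK from mm/swap.h, and the split below restates what the new swap_cache_index() helper computes; this is not the kernel code itself):

	#include <stdio.h>

	/* One swap address space per 64M of swap: 2^14 pages of 4K. */
	#define SWAP_ADDRESS_SPACE_SHIFT 14
	#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
	#define SWAP_ADDRESS_SPACE_MASK  (SWAP_ADDRESS_SPACE_PAGES - 1)

	int main(void)
	{
		/* An offset deep inside a large (e.g. 128G) swap device. */
		unsigned long offset = 30000000UL;

		/* Which 64M address space the entry belongs to ... */
		unsigned long space = offset >> SWAP_ADDRESS_SPACE_SHIFT;
		/*
		 * ... and its index *within* that space: always < 16384,
		 * so a 64-slots-per-node XArray never needs more than
		 * 3 levels, no matter how large the swap device is.
		 */
		unsigned long index = offset & SWAP_ADDRESS_SPACE_MASK;

		printf("space %lu, index %lu\n", space, index);
		return 0;
	}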
For a 64M chunk it should take at most 3 levels to contain everything. But if we are using the offset from the whole swap device, the offset (key) value will be way beyond 64M, and so will the tree depth. Optimize this by using a new helper swap_cache_index to get a swap entry's unique offset within its own 64M swap_address_space. I see a ~1% performance gain in benchmarks and an actual workload under high memory pressure. Tested with `time memhog 128G` inside an 8G memcg using 128G of swap (ramdisk with SWP_SYNCHRONOUS_IO dropped; tested 3 times, results are stable. The test result is similar but the improvement is smaller if SWP_SYNCHRONOUS_IO is enabled, as the swap-out path can never skip the swap cache): Before: 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps After (1.8% faster): 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps Similar results with MySQL and sysbench using swap: Before: 94055.61 qps After (0.8% faster): 94834.91 qps Radix tree slab usage is also very slightly lower. Link: https://lkml.kernel.org/r/20240521175854.96038-12-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/shmem.c | 2 +- mm/swap.h | 15 +++++++++++++++ mm/swap_state.c | 17 +++++++++-------- mm/swapfile.c | 6 +++--- 7 files changed, 31 insertions(+), 15 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db7946a0a28c4b..cff47b3fc72ef6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2838,7 +2838,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { - offset = swp_offset(folio->swap); + offset = swap_cache_index(folio->swap); swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71fe2a95b8bd33..86a741d8a37d4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6146,7 +6146,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly.
*/ - page = find_get_page(swap_address_space(ent), swp_offset(ent)); + page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); entry->val = ent.val; return page; diff --git a/mm/mincore.c b/mm/mincore.c index dad3622cc963e8..e31cf1bde61412 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -139,7 +139,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { #ifdef CONFIG_SWAP *vec = mincore_page(swap_address_space(entry), - swp_offset(entry)); + swap_cache_index(entry)); #else WARN_ON(1); *vec = 1; diff --git a/mm/shmem.c b/mm/shmem.c index b2dce4103f0e78..3e7d60d71d3124 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1742,7 +1742,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old = *foliop; entry = old->swap; - swap_index = swp_offset(entry); + swap_index = swap_cache_index(entry); swap_mapping = swap_address_space(entry); /* diff --git a/mm/swap.h b/mm/swap.h index 82023ab93205f4..2c0e96272d498d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -27,6 +27,7 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* One swap address space for each 64M swap space */ #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) extern struct address_space *swapper_spaces[]; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ @@ -40,6 +41,15 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } +/* + * Return the swap cache index of the swap entry. + */ +static inline pgoff_t swap_cache_index(swp_entry_t entry) +{ + BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); + return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; +} + void show_swap_cache_info(void); bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); @@ -86,6 +96,11 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } +static inline pgoff_t swap_cache_index(swp_entry_t entry) +{ + return 0; +} + static inline void show_swap_cache_info(void) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 6498491e3ad8d8..a5dae40523ab69 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -72,7 +72,7 @@ void show_swap_cache_info(void) void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); void *shadow; shadow = xa_load(&address_space->i_pages, idx); @@ -89,7 +89,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; @@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct folio *folio, struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); @@ -253,13 +253,14 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, for (;;) { swp_entry_t entry = swp_entry(type, curr); + unsigned long index = curr & 
SWAP_ADDRESS_SPACE_MASK; struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, curr); + XA_STATE(xas, &address_space->i_pages, index); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, end) { + xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); @@ -350,7 +351,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, { struct folio *folio; - folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -420,7 +421,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); - index = swp_offset(swp); + index = swap_cache_index(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; @@ -447,7 +448,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * that would confuse statistics. */ folio = filemap_get_folio(swap_address_space(entry), - swp_offset(entry)); + swap_cache_index(entry)); if (!IS_ERR(folio)) goto got_folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 348c4d3ae078ae..f1e559e216bdc8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -142,7 +142,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, struct folio *folio; int ret = 0; - folio = filemap_get_folio(swap_address_space(entry), offset); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) return 0; /* @@ -2158,7 +2158,7 @@ static int try_to_unuse(unsigned int type) (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - folio = filemap_get_folio(swap_address_space(entry), i); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; @@ -3451,7 +3451,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { - return swp_offset(folio->swap); + return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); -- cgit 1.2.3-korg From 3d95bc21cea558c7cdb2942b4d0223a571e93f27 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:06 +0800 Subject: mm: shmem: add THP validation for PMD-mapped THP related statistics In order to extend support for mTHP, add THP validation for PMD-mapped THP related statistics to avoid statistical confusion. 
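As an illustration of the rule this patch enforces, here is a hypothetical, runnable C model of the accounting; the counter variables and the HPAGE_PMD_ORDER value are stand-ins for the kernel's, and only the guard logic is the point (legacy THP counters move for PMD-sized attempts only, never for smaller mTHP orders):

	#include <stdio.h>
	#include <stdbool.h>

	#define HPAGE_PMD_ORDER 9	/* assumption: 2M PMD with 4K pages */

	/* Toy stand-ins for the kernel's THP_FILE_* vm_event counters. */
	static unsigned long thp_file_alloc, thp_file_fallback;

	static void account_alloc(int order, bool success)
	{
		if (order != HPAGE_PMD_ORDER)
			return;		/* mTHP: legacy counters untouched */
		if (success)
			thp_file_alloc++;
		else
			thp_file_fallback++;
	}

	int main(void)
	{
		account_alloc(9, true);		/* PMD-sized THP: counted */
		account_alloc(4, false);	/* 64K mTHP: not counted */
		printf("alloc=%lu fallback=%lu\n",
		       thp_file_alloc, thp_file_fallback);
		return 0;
	}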
Link: https://lkml.kernel.org/r/c4b04cbd51e6951cc2436a87be8eaa4a1516faec.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 3e7d60d71d3124..bf22909f1261ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1647,7 +1647,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, return ERR_PTR(-E2BIG); folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); - if (!folio) + if (!folio && pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); } else { pages = 1; @@ -1665,7 +1665,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (huge) { + } else if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } @@ -2031,7 +2031,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, folio = shmem_alloc_and_add_folio(huge_gfp, inode, index, fault_mm, true); if (!IS_ERR(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- cgit 1.2.3-korg From 4b98995530b77a97912230d8e1564ba7738db19c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: mm: shmem: add multi-size THP sysfs interface for anonymous shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with values similar to the top-level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if the top-level value is 'force', then only PMD-sized hugepages may have enabled="inherit"; otherwise the configuration fails, and vice versa. This means we now avoid using non-PMD-sized THP to override the global huge allocation.
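A rough userspace model of how such per-size settings can be represented: one bitmask per policy with one bit per order, mirroring the huge_shmem_orders_* variables this patch introduces. Names, the HPAGE_PMD_ORDER value, and the resolve function below are illustrative assumptions, not the kernel implementation:

	#include <stdio.h>

	#define HPAGE_PMD_ORDER 9	/* assumption: 2M PMD with 4K pages */

	/* One bitmask per policy; bit N set means order N uses that policy. */
	static unsigned long orders_always;
	static unsigned long orders_inherit = 1UL << HPAGE_PMD_ORDER; /* default */
	static unsigned long orders_within_size;
	static unsigned long orders_madvise;

	static const char *policy_of(int order)
	{
		unsigned long bit = 1UL << order;

		if (orders_always & bit)
			return "always";
		if (orders_inherit & bit)
			return "inherit";
		if (orders_within_size & bit)
			return "within_size";
		if (orders_madvise & bit)
			return "advise";
		return "never";	/* every other size defaults to never */
	}

	int main(void)
	{
		printf("order 9 (2M):  %s\n", policy_of(9));	/* inherit */
		printf("order 4 (64K): %s\n", policy_of(4));	/* never */
		return 0;
	}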
[baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 25 ++++++++ include/linux/huge_mm.h | 10 ++++ mm/huge_memory.c | 12 ++-- mm/shmem.c | 96 ++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 8 deletions(-) (limited to 'mm/shmem.c') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index d414d3f5592a8f..e7232b46fe1412 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -332,6 +332,31 @@ deny force Force the huge option on for all - very useful for testing; +Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to +control mTHP allocation: +'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled', +and its value for each mTHP is essentially consistent with the global +setting. An 'inherit' option is added to ensure compatibility with these +global settings. Conversely, the options 'force' and 'deny' are dropped, +which are rather testing artifacts from the old ages. + +always + Attempt to allocate huge pages every time we need a new page; + +inherit + Inherit the top-level "shmem_enabled" value. By default, PMD-sized hugepages + have enabled="inherit" and all other hugepage sizes have enabled="never"; + +never + Do not allocate huge pages; + +within_size + Only allocate huge page if it will be fully within i_size. 
+ Also respect fadvise()/madvise() hints; + +advise + Only allocate huge pages if requested with fadvise()/madvise(); + Need of application restart =========================== diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 088d66a5464376..6a1edd75296880 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cff47b3fc72ef6..af6f239b0f97a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -449,14 +450,6 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -struct thpsize { - struct kobject kobj; - struct list_head node; - int order; -}; - -#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) - static ssize_t thpsize_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -517,6 +510,9 @@ static struct kobj_attribute thpsize_enabled_attr = static struct attribute *thpsize_attrs[] = { &thpsize_enabled_attr.attr, +#ifdef CONFIG_SHMEM + &thpsize_shmem_enabled_attr.attr, +#endif NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index bf22909f1261ca..d80608f9d6affb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -131,6 +131,13 @@ struct shmem_options { #define SHMEM_SEEN_QUOTA 32 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -4672,6 +4679,12 @@ void __init shmem_init(void) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. 
+ */ + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -4731,6 +4744,11 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; @@ -4738,6 +4756,84 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); 
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ -- cgit 1.2.3-korg From e7a2ab7b3bb5d87f99f2ea3d4481d52fc5ceb52d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:08 +0800 Subject: mm: shmem: add mTHP support for anonymous shmem Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, allowing THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, anonymous shmem ignores the anonymous mTHP rule configured through the sysfs interface and can only use PMD-mapped THP, which is not reasonable. Users expect to apply the mTHP rule to all anonymous pages, including anonymous shmem, in order to enjoy the benefits of mTHP: for example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, and contiguous PTEs on the ARM architecture to reduce TLB misses, etc. In addition, the mTHP interfaces can be extended to support all shmem/tmpfs scenarios in the future, especially the shmem mmap() case. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', adding a new "inherit" option and dropping the testing options 'force' and 'deny'. By default, all sizes will be set to "never" except PMD size, which is set to "inherit". This ensures backward compatibility with the top-level anonymous shmem 'shmem_enabled' setting, while also allowing independent control of anonymous shmem enablement for each mTHP size. Link: https://lkml.kernel.org/r/65796c1e72e51e15f3410195b5c2d5b6c160d411.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 +++ mm/shmem.c | 187 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 167 insertions(+), 30 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a1edd75296880..53b7a137f46032 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -560,6 +560,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/shmem.c b/mm/shmem.c index d80608f9d6affb..8225b2f230ebb6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1610,6 +1610,107 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + unsigned long vm_flags = vma->vm_flags; + /* + * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that + * are enabled for this vma.
+ */ + unsigned long orders = BIT(PMD_ORDER + 1) - 1; + loff_t i_size; + int order; + + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return 0; + + /* If the hardware/firmware marked hugepage support disabled. */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) + return 0; + + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. + */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + order = highest_order(within_size_orders); + while (within_size_orders) { + index = round_up(index + 1, order); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= index) { + mask |= within_size_orders; + break; + } + + order = next_order(&within_size_orders, order); + } + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); + + if (global_huge) + mask |= READ_ONCE(huge_shmem_orders_inherit); + + return orders & mask; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long pages; + int order; + + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + index = round_down(index, pages); + if (!xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } + + return orders; +} +#else +static unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { @@ -1624,38 +1725,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, return folio; } -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, - struct inode *inode, pgoff_t index, - struct mm_struct *fault_mm, bool huge) +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *folio; + struct vm_area_struct *vma = vmf ? 
vmf->vma : NULL; + unsigned long suitable_orders = 0; + struct folio *folio = NULL; long pages; - int error; + int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - huge = false; + orders = 0; - if (huge) { - pages = HPAGE_PMD_NR; - index = round_down(index, HPAGE_PMD_NR); + if (orders > 0) { + if (vma && vma_is_anon_shmem(vma)) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + } else if (orders & BIT(HPAGE_PMD_ORDER)) { + pages = HPAGE_PMD_NR; + suitable_orders = BIT(HPAGE_PMD_ORDER); + index = round_down(index, HPAGE_PMD_NR); - /* - * Check for conflict before waiting on a huge allocation. - * Conflict might be that a huge page has just been allocated - * and added to page cache by a racing thread, or that there - * is already at least one small page in the huge extent. - * Be careful to retry when appropriate, but not forever! - * Elsewhere -EEXIST would be the right code, but not here. - */ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (xa_find(&mapping->i_pages, &index, + index + HPAGE_PMD_NR - 1, XA_PRESENT)) + return ERR_PTR(-E2BIG); + } - folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); - if (!folio && pages == HPAGE_PMD_NR) - count_vm_event(THP_FILE_FALLBACK); + order = highest_order(suitable_orders); + while (suitable_orders) { + pages = 1UL << order; + index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, index); + if (folio) + goto allocated; + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + order = next_order(&suitable_orders, order); + } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); @@ -1663,6 +1781,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, if (!folio) return ERR_PTR(-ENOMEM); +allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); @@ -1957,7 +2076,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm; struct folio *folio; int error; - bool alloced; + bool alloced, huge; + unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; @@ -2029,14 +2149,21 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - if (shmem_is_huge(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0)) { + huge = shmem_is_huge(inode, index, false, fault_mm, + vma ? vma->vm_flags : 0); + /* Find hugepage orders that are allowed for anonymous shmem. 
*/ + if (vma && vma_is_anon_shmem(vma)) + orders = shmem_allowable_huge_orders(inode, vma, index, huge); + else if (huge) + orders = BIT(HPAGE_PMD_ORDER); + + if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_add_folio(huge_gfp, - inode, index, fault_mm, true); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); @@ -2046,7 +2173,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, goto repeat; } - folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) @@ -2057,7 +2184,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && + if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio) - 1) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); -- cgit 1.2.3-korg From 5a9dd10380a16b343aa87d80d5bcc24409a03f5b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:09 +0800 Subject: mm: shmem: add mTHP size alignment in shmem_get_unmapped_area Although the top-level hugepage allocation can be turned off, anonymous shmem can still use mTHP by configuring the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled'. Therefore, add alignment for mTHP size to provide a suitable alignment address in shmem_get_unmapped_area(). Link: https://lkml.kernel.org/r/0c549b57cf7db07503af692d8546ecfad0fcce52.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Tested-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/shmem.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 8225b2f230ebb6..224162aa204bcd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2393,6 +2393,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; + unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; @@ -2411,8 +2412,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (shmem_huge == SHMEM_HUGE_DENY) return addr; - if (len < HPAGE_PMD_SIZE) - return addr; if (flags & MAP_FIXED) return addr; /* @@ -2424,8 +2423,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (uaddr == addr) return addr; + hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); @@ -2438,18 +2440,38 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif } - if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } - offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); - if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + if (len < hpage_size) + return addr; + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) return addr; - if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + if ((addr & (hpage_size - 1)) == offset) return addr; - inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) @@ -2462,10 +2484,10 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_addr & ~PAGE_MASK) return addr; - inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) - inflated_addr += HPAGE_PMD_SIZE; + inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; -- cgit 1.2.3-korg From 66f44583f9b617d74ffa2487e75a9c3adf344ddb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: mm: shmem: add mTHP counters for anonymous shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++ include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 6 ++++++ mm/shmem.c | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) (limited to 'mm/shmem.c') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index e7232b46fe1412..1f72b00af5d359 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -501,6 +501,19 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. +file_alloc + is incremented every time a file huge page is successfully + allocated. + +file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + +file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 53b7a137f46032..7ad41de5eaea40 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -281,6 +281,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index af6f239b0f97a5..f55362a73e2d1d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -556,6 +556,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -563,6 +566,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, + &file_alloc_attr.attr, + &file_fallback_attr.attr, + &file_fallback_charge_attr.attr, NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index 224162aa204bcd..d09c6bf1f28a66 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1772,6 +1772,9 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); +#endif order = next_order(&suitable_orders, order); } } else { @@ -1791,9 +1794,15 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (pages == HPAGE_PMD_NR) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); +#endif } goto unlock; } @@ -2167,6 +2176,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); +#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- cgit 1.2.3-korg From 843a2e24c24c5311831860c6b78ceacdd4627000 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Fri, 28 Jun 2024 11:23:27 +0800 Subject: mm/shmem: fix input and output inconsistencies Commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP") added mTHP support for anonymous shmem. We can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But when we configure the "advise" policy of /sys/kernel/mm/transparent_hugepage/hugepages-xxxkB/shmem_enabled, we cannot write "advise"; we must write "madvise" instead, which is unreasonable. We should keep the input and output values consistent, which is more convenient for users.
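A sketch of the defensive pattern that avoids this bug class: drive both show and store from one token table, so the accepted inputs cannot drift from the displayed outputs the way "advise"/"madvise" did. This is hypothetical code, not the kernel's implementation (which matches separate literals with sysfs_streq(), a newline-tolerant comparison):

	#include <stdio.h>
	#include <string.h>

	/* Single source of truth for both show and store. */
	static const char *const tokens[] = {
		"always", "inherit", "within_size", "advise", "never",
	};

	static int parse(const char *buf)
	{
		for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++)
			if (!strcmp(buf, tokens[i]))
				return (int)i;
		return -1;	/* would be -EINVAL in the kernel */
	}

	int main(void)
	{
		printf("advise  -> %d\n", parse("advise"));	/* accepted: 3 */
		printf("madvise -> %d\n", parse("madvise"));	/* rejected: -1 */
		return 0;
	}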
Link: https://lkml.kernel.org/r/20240628032327.16987-1-libang.li@antgroup.com Fixes: 61a57f1b1da9 ("mm: shmem: add multi-size THP sysfs interface for anonymous shmem") Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: Bang Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 426b09d8cc5534..f24177e9d5cc57 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4983,7 +4983,7 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "madvise")) { + } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); -- cgit 1.2.3-korg From 26c7d8413aaf113a54b54f63e151416a5c5c2a88 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Fri, 5 Jul 2024 11:23:09 +0800 Subject: mm: thp: support "THPeligible" semantics for mTHP with anonymous shmem After commit 7fb1b252afb5 ("mm: shmem: add mTHP support for anonymous shmem"), we can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But currently, for anonymous shmem, "THPeligible" indicates only whether the mapping is eligible for allocating PMD-mappable THP; we need to support "THPeligible" semantics for mTHP with anonymous shmem similar to those for mTHP with anonymous memory. Link: https://lkml.kernel.org/r/20240705032309.24933-1-libang.li@antgroup.com Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ mm/huge_memory.c | 13 +++++++++---- mm/shmem.c | 9 +-------- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'mm/shmem.c') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3fb18f7eb73eaf..1d06b1e5408a53 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,12 +113,21 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge); #else static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { return false; } +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} #endif #ifdef CONFIG_SHMEM diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8b5cbd4dd71a3..9ec64aa2be9484 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -151,10 +151,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Must be done before hugepage flags check since shmem has its * own flags. */ - if (!in_pf && shmem_file(vma->vm_file)) - return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags) - ?
orders : 0; + if (!in_pf && shmem_file(vma->vm_file)) { + bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, + !enforce_sysfs, vma->vm_mm, vm_flags); + + if (!vma_is_anon_shmem(vma)) + return global_huge ? orders : 0; + return shmem_allowable_huge_orders(file_inode(vma->vm_file), + vma, vma->vm_pgoff, global_huge); + } if (!vma_is_anonymous(vma)) { /* diff --git a/mm/shmem.c b/mm/shmem.c index f24177e9d5cc57..921d59c3d6690a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static unsigned long shmem_allowable_huge_orders(struct inode *inode, +unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, bool global_huge) { @@ -1707,13 +1707,6 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault return orders; } #else -static unsigned long shmem_allowable_huge_orders(struct inode *inode, - struct vm_area_struct *vma, pgoff_t index, - bool global_huge) -{ - return 0; -} - static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) -- cgit 1.2.3-korg From 63d9866ab01ffd0d0835d5564107283a4afc0a38 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 10 Jul 2024 10:55:01 +0100 Subject: mm: shmem: rename mTHP shmem counters The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Daniel Gomez Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 29 ++++++++++++++++------------- include/linux/huge_mm.h | 6 +++--- mm/huge_memory.c | 12 ++++++------ mm/shmem.c | 8 ++++---- 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'mm/shmem.c') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index fe237825b95c62..058485daf1860d 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -412,20 +412,23 @@ thp_collapse_alloc_failed the allocation. thp_file_alloc - is incremented every time a file huge page is successfully - allocated. 
+ is incremented every time a shmem huge page is successfully + allocated (Note that despite being named after "file", the counter + measures only shmem). thp_file_fallback - is incremented if a file huge page is attempted to be allocated - but fails and instead falls back to using small pages. + is incremented if a shmem huge page is attempted to be allocated + but fails and instead falls back to using small pages. (Note that + despite being named after "file", the counter measures only shmem). thp_file_fallback_charge - is incremented if a file huge page cannot be charged and instead + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was - successful. + successful. (Note that despite being named after "file", the + counter measures only shmem). thp_file_mapped - is incremented every time a file huge page is mapped into + is incremented every time a file or shmem huge page is mapped into user address space. thp_split_page @@ -496,16 +499,16 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. -file_alloc - is incremented every time a file huge page is successfully +shmem_alloc + is incremented every time a shmem huge page is successfully allocated. -file_fallback - is incremented if a file huge page is attempted to be allocated +shmem_fallback + is incremented if a shmem huge page is attempted to be allocated but fails and instead falls back to using small pages. -file_fallback_charge - is incremented if a file huge page cannot be charged and instead +shmem_fallback_charge + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was successful. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index acb6ac24a07e6f..cff002be83eb91 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,9 +269,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_FILE_ALLOC, - MTHP_STAT_FILE_FALLBACK, - MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ec64aa2be9484..f9696c94e2110b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -568,9 +568,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); -DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); +DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); +DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); @@ -581,9 +581,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, - &file_alloc_attr.attr, - &file_fallback_attr.attr, - &file_fallback_charge_attr.attr, + 
&shmem_alloc_attr.attr, + &shmem_fallback_attr.attr, + &shmem_fallback_charge_attr.attr, &split_attr.attr, &split_failed_attr.attr, &split_deferred_attr.attr, diff --git a/mm/shmem.c b/mm/shmem.c index 921d59c3d6690a..f24dfbd387ba3d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1777,7 +1777,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); #endif order = next_order(&suitable_orders, order); } @@ -1804,8 +1804,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, count_vm_event(THP_FILE_FALLBACK_CHARGE); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); #endif } goto unlock; @@ -2181,7 +2181,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); #endif goto alloced; } -- cgit 1.2.3-korg