From e976936cfc66376fc740a3a476365273384ce1ce Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Dec 2022 14:45:37 -0500 Subject: mm/mempolicy: do not duplicate policy if it is not applicable for set_mempolicy_home_node set_mempolicy_home_node tries to duplicate a memory policy before checking it whether it is applicable for the operation. There is no real reason for doing that and it might actually be a pointless memory allocation and deallocation exercise for MPOL_INTERLEAVE. Not a big problem but we can do better. Simply check the policy before acting on it. Link: https://lkml.kernel.org/r/20221216194537.238047-2-mathieu.desnoyers@efficios.com Signed-off-by: Michal Hocko Reviewed-by: Mathieu Desnoyers Signed-off-by: Mathieu Desnoyers Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f11..becf41e100762c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1489,7 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct mempolicy *new; + struct mempolicy *new, *old; unsigned long vmstart; unsigned long vmend; unsigned long end; @@ -1521,31 +1521,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le return 0; mmap_write_lock(mm); for_each_vma_range(vmi, vma, end) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - new = mpol_dup(vma_policy(vma)); - if (IS_ERR(new)) { - err = PTR_ERR(new); - break; - } - /* - * Only update home node if there is an existing vma policy - */ - if (!new) - continue; - /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ - if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { - mpol_put(new); + old = vma_policy(vma); + if (!old) + continue; + if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } + new = mpol_dup(old); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } new->home_node = home_node; + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); err = mbind_range(mm, vmstart, vmend, new); mpol_put(new); if (err) -- cgit 1.2.3-korg From 1ef488edd6c4d447784710974f049628c2890481 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Dec 2022 16:56:16 +0100 Subject: mm/mprotect: drop pgprot_t parameter from change_protection() Being able to provide a custom protection opens the door for inconsistencies and BUGs: for example, accidentally allowing for more permissions than desired by other mechanisms (e.g., softdirty tracking). vma->vm_page_prot should be the single source of truth. Only PROT_NUMA is special: there is no way we can erroneously allow for more permissions when removing all permissions. Special-case using the MM_CP_PROT_NUMA flag. [david@redhat.com: PAGE_NONE might not be defined without CONFIG_NUMA_BALANCING] Link: https://lkml.kernel.org/r/5084ff1c-ebb3-f918-6a60-bacabf550a88@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Nadav Amit Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +-- mm/mempolicy.c | 3 +-- mm/mprotect.c | 18 +++++++++++++++--- mm/userfaultfd.c | 3 +-- 4 files changed, 18 insertions(+), 9 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index d68579bf84848f..329ed67edd7644 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2134,8 +2134,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags); + unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index becf41e100762c..d3558248a0f0dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -635,8 +635,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); - nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, - MM_CP_PROT_NUMA); + nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/mprotect.c b/mm/mprotect.c index bf8fa0af5a159d..71358e45a742f3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -507,13 +507,25 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags) + unsigned long end, unsigned long cp_flags) { + pgprot_t newprot = vma->vm_page_prot; unsigned long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); +#ifdef CONFIG_NUMA_BALANCING + /* + * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking) + * are expected to reflect their requirements via VMA flags such that + * vma_set_page_prot() will adjust vma->vm_page_prot accordingly. + */ + if (cp_flags & MM_CP_PROT_NUMA) + newprot = PAGE_NONE; +#else + WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA); +#endif + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot, cp_flags); @@ -642,7 +654,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); + change_protection(tlb, vma, start, end, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 46771362550fee..65ad172add276c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -730,8 +730,7 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, vma->vm_page_prot, - mm_cp_flags); + change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); } -- cgit 1.2.3-korg From a79390f5d6a78647fd70856bd42b22d994de0ba2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:06 -0500 Subject: mm/mprotect: use long for page accountings and retval Switch to use type "long" for page accountings and retval across the whole procedure of change_protection(). The change should have shrinked the possible maximum page number to be half comparing to previous (ULONG_MAX / 2), but it shouldn't overflow on any system either because the maximum possible pages touched by change protection should be ULONG_MAX / PAGE_SIZE. Two reasons to switch from "unsigned long" to "long": 1. It suites better on count_vm_numa_events(), whose 2nd parameter takes a long type. 2. It paves way for returning negative (error) values in the future. Currently the only caller that consumes this retval is change_prot_numa(), where the unsigned long was converted to an int. Since at it, touching up the numa code to also take a long, so it'll avoid any possible overflow too during the int-size convertion. Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Kravetz Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 26 +++++++++++++------------- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b6b10101bea725..e3aa336df900ea 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -248,7 +248,7 @@ void hugetlb_vma_lock_release(struct kref *kref); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); @@ -437,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio, { } -static inline unsigned long hugetlb_change_protection( +static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 329ed67edd7644..4ac5ea4b584c92 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82f410241671d..a0d6d09800649b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6615,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -6624,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0, psize = huge_page_size(h); + long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d3558248a0f0dc..a86b8f15e2f097 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -631,7 +631,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; - int nr_updated; + long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 71358e45a742f3..0af22ab59ea80e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } -static unsigned long change_pte_range(struct mmu_gather *tlb, +static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + long pages = 0; int target_node = NUMA_NO_NODE; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -353,13 +353,13 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) } \ } while (0) -static inline unsigned long change_pmd_range(struct mmu_gather *tlb, +static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; + long pages = 0; unsigned long nr_huge_updates = 0; struct mmu_notifier_range range; @@ -367,7 +367,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; + long this_pages; next = pmd_addr_end(addr, end); @@ -437,13 +437,13 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, return pages; } -static inline unsigned long change_pud_range(struct mmu_gather *tlb, +static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; - unsigned long pages = 0; + long pages = 0; pud = pud_offset(p4d, addr); do { @@ -458,13 +458,13 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb, return pages; } -static inline unsigned long change_p4d_range(struct mmu_gather *tlb, +static inline long change_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; - unsigned long pages = 0; + long pages = 0; p4d = p4d_offset(pgd, addr); do { @@ -479,14 +479,14 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb, return pages; } -static unsigned long change_protection_range(struct mmu_gather *tlb, +static long change_protection_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long pages = 0; + long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -505,12 +505,12 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, return pages; } -unsigned long change_protection(struct mmu_gather *tlb, +long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags) { pgprot_t newprot = vma->vm_page_prot; - unsigned long pages; + long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); -- cgit 1.2.3-korg From d1751118c88673fe5a948ad82277898e9e284c55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:07 -0500 Subject: mm/uffd: detect pgtable allocation failures Before this patch, when there's any pgtable allocation issues happened during change_protection(), the error will be ignored from the syscall. For shmem, there will be an error dumped into the host dmesg. Two issues with that: (1) Doing a trace dump when allocation fails is not anything close to grace. (2) The user should be notified with any kind of such error, so the user can trap it and decide what to do next, either by retrying, or stop the process properly, or anything else. For userfault users, this will change the API of UFFDIO_WRITEPROTECT when pgtable allocation failure happened. It should not normally break anyone, though. If it breaks, then in good ways. One man-page update will be on the way to introduce the new -ENOMEM for UFFDIO_WRITEPROTECT. Not marking stable so we keep the old behavior on the 5.19-till-now kernels. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230104225207.1066932-4-peterx@redhat.com Signed-off-by: Peter Xu Reported-by: James Houghton Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 +- mm/hugetlb.c | 6 +++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 63 ++++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 16 +++++++---- 5 files changed, 59 insertions(+), 30 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc93..3767f18114ef9f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d6d09800649b..6fe65f14d33b0b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6660,8 +6660,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); - if (!ptep) + if (!ptep) { + pages = -ENOMEM; break; + } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { @@ -6751,7 +6753,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); - return pages << h->order; + return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a86b8f15e2f097..85a34f1f3ab8f8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,7 +636,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated) + if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0af22ab59ea80e..92fc6f3fa5125d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -330,28 +330,34 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) /* * If wr-protecting the range for file-backed, populate pgtable for the case * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed it means no memory, we don't have a better option but stop. + * failed we treat it the same way as pgtable allocation failures during + * page faults by kicking OOM and returning error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ - if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ - break; \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) + /* * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. */ #define change_prepare(vma, high, low, addr, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ - if (WARN_ON_ONCE(p == NULL)) \ - break; \ + if (p == NULL) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, @@ -367,11 +373,15 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - long this_pages; + long ret; next = pmd_addr_end(addr, end); - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } /* * Automatic NUMA balancing walks the tables with mmap_lock * held for read. It's possible a parallel update to occur @@ -401,7 +411,11 @@ static inline long change_pmd_range(struct mmu_gather *tlb, * cleared; make sure pmd populated if * necessary, then fall-through to pte level. */ - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } } else { /* * change_huge_pmd() does not defer TLB flushes, @@ -422,9 +436,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(tlb, vma, pmd, addr, next, - newprot, cp_flags); - pages += this_pages; + pages += change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -443,12 +456,14 @@ static inline long change_pud_range(struct mmu_gather *tlb, { pud_t *pud; unsigned long next; - long pages = 0; + long pages = 0, ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - change_prepare(vma, pud, pmd, addr, cp_flags); + ret = change_prepare(vma, pud, pmd, addr, cp_flags); + if (ret) + return ret; if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, @@ -464,12 +479,14 @@ static inline long change_p4d_range(struct mmu_gather *tlb, { p4d_t *p4d; unsigned long next; - long pages = 0; + long pages = 0, ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - change_prepare(vma, p4d, pud, addr, cp_flags); + ret = change_prepare(vma, p4d, pud, addr, cp_flags); + if (ret) + return ret; if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, @@ -486,14 +503,18 @@ static long change_protection_range(struct mmu_gather *tlb, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - long pages = 0; + long pages = 0, ret; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); - change_prepare(vma, pgd, p4d, addr, cp_flags); + ret = change_prepare(vma, pgd, p4d, addr, cp_flags); + if (ret) { + pages = ret; + break; + } if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 65ad172add276c..53c3d916ff661e 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -710,11 +710,12 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } -void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, +long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; + long ret; if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; @@ -730,8 +731,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); + ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); + + return ret; } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, @@ -740,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, { struct vm_area_struct *dst_vma; unsigned long page_mask; - int err; + long err; /* * Sanitize the command parameters: @@ -779,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + + /* Return 0 on success, <0 on failures */ + if (err > 0) + err = 0; - err = 0; out_unlock: mmap_read_unlock(dst_mm); return err; -- cgit 1.2.3-korg From f10c2abcdac4a44795fae9118eaedfe56204afda Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:21 -0500 Subject: mempolicy: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mempolicy.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72142fbe765280..ed68bdf980d3b3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,24 +787,21 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); - - vma = mas_find(&mas, end - 1); + prev = vma_prev(&vmi); + vma = vma_find(&vmi, end); if (WARN_ON(!vma)) return 0; if (start > vma->vm_start) prev = vma; - for (; vma; vma = mas_next(&mas, end - 1)) { + do { unsigned long vmstart = max(start, vma->vm_start); unsigned long vmend = min(end, vma->vm_end); @@ -813,29 +810,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } replace: err = vma_replace_policy(vma, new_pol); @@ -843,7 +834,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto out; next: prev = vma; - } + } for_each_vma_range(vmi, vma, end); out: return err; -- cgit 1.2.3-korg From 9760ebffbf5507320e0de41f5b80089bdef996a0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:30 -0500 Subject: mm: switch vma_merge(), split_vma(), and __split_vma to vma iterator Drop the vmi_* functions and transition all users to use the vma iterator directly. Link: https://lkml.kernel.org/r/20230120162650.984577-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 14 +++++----- include/linux/mm.h | 18 ++++--------- mm/madvise.c | 6 ++--- mm/mempolicy.c | 6 ++--- mm/mlock.c | 6 ++--- mm/mmap.c | 79 +++++++++++++++--------------------------------------- mm/mprotect.c | 6 ++--- mm/mremap.c | 10 +++---- mm/nommu.c | 8 +++--- 9 files changed, 55 insertions(+), 98 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4334bd35984d6c..f3c75c6222de6d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -909,7 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1452,7 +1452,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), @@ -1463,12 +1463,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } @@ -1632,7 +1632,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); @@ -1641,12 +1641,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 144ddfd6599227..f3b49feb5c3588 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2839,24 +2839,16 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, { return __vma_adjust(vma, start, end, pgoff, insert, NULL); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); -extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/madvise.c b/mm/madvise.c index 4d4471916465ff..02b317726c9aa6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -150,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { @@ -163,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, start, 1); + error = __split_vma(&vmi, vma, start, 1); if (error) return error; } @@ -171,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, end, 0); + error = __split_vma(&vmi, vma, end, 0); if (error) return error; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed68bdf980d3b3..dd5ca942256f30 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, + prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -819,12 +819,12 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto replace; } if (vma->vm_start != vmstart) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); + err = split_vma(&vmi, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); + err = split_vma(&vmi, vma, vmend, 0); if (err) goto out; } diff --git a/mm/mlock.c b/mm/mlock.c index 0d09b9070071c0..0336f52e03d752 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -418,7 +418,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { @@ -427,13 +427,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - ret = vmi_split_vma(vmi, mm, vma, start, 1); + ret = split_vma(vmi, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = vmi_split_vma(vmi, mm, vma, end, 0); + ret = split_vma(vmi, vma, end, 0); if (ret) goto out; } diff --git a/mm/mmap.c b/mm/mmap.c index 8806bfbaa5055c..afc65f122f7dab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1010,7 +1010,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, @@ -1019,7 +1019,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *mid, *next, *res; + struct vm_area_struct *mid, *next, *res = NULL; int err = -1; bool merge_prev = false; bool merge_next = false; @@ -1085,26 +1085,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (err) return NULL; khugepaged_enter_vma(res, vm_flags); - return res; -} -struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, - struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct vm_area_struct *tmp; - - tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, - policy, vm_userfaultfd_ctx, anon_name); - if (tmp) + if (res) vma_iter_set(vmi, end); - return tmp; + return res; } /* @@ -2228,12 +2213,14 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ -int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - validate_mm_mt(mm); + unsigned long end = vma->vm_end; + + validate_mm_mt(vma->vm_mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2273,8 +2260,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ - if (!err) + if (!err) { + vma_iter_set(vmi, end); return 0; + } /* Avoid vm accounting in close() operation */ new->vm_start = new->vm_end; @@ -2289,46 +2278,21 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); - validate_mm_mt(mm); + validate_mm_mt(vma->vm_mm); return err; } -int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = __split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; -} /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); -} - -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; + return __split_vma(vmi, vma, addr, new_below); } static inline int munmap_sidetree(struct vm_area_struct *vma, @@ -2388,7 +2352,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = vmi__split_vma(vmi, mm, vma, start, 0); + error = __split_vma(vmi, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2373,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = vmi__split_vma(vmi, mm, next, end, 1); + error = __split_vma(vmi, next, end, 1); if (error) goto end_split_failed; @@ -2690,9 +2654,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, vma->vm_file, - vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, + vma->vm_file, vma->vm_pgoff, NULL, + NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3249,7 +3214,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 39b6335b8813b7..cce6a0e58fb507 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -642,7 +642,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, + *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, *pprev = vma; if (start != vma->vm_start) { - error = vmi_split_vma(vmi, mm, vma, start, 1); + error = split_vma(vmi, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = vmi_split_vma(vmi, mm, vma, end, 0); + error = split_vma(vmi, vma, end, 0); if (error) goto fail; } diff --git a/mm/mremap.c b/mm/mremap.c index f161516ab3c1e6..71ba8eddd836be 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1043,12 +1043,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vmi_vma_merge(&vmi, mm, vma, - extension_start, extension_end, - vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + vma = vma_merge(&vmi, mm, vma, extension_start, + extension_end, vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; diff --git a/mm/nommu.c b/mm/nommu.c index 9ddeb92600d6d5..9a166738909ea7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1297,18 +1297,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1465,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } -- cgit 1.2.3-korg From 6aa3a920125e9f58891e2b5dc2efd4d0c1ff05a6 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:50 -0600 Subject: mm/hugetlb: convert isolate_hugetlb to folios Patch series "continue hugetlb folio conversion", v3. This series continues the conversion of core hugetlb functions to use folios. This series converts many helper funtions in the hugetlb fault path. This is in preparation for another series to convert the hugetlb fault code paths to operate on folios. This patch (of 8): Convert isolate_hugetlb() to take in a folio and convert its callers to pass a folio. Use page_folio() to convert the callers to use a folio is safe as isolate_hugetlb() operates on a head page. Link: https://lkml.kernel.org/r/20230113223057.173292-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230113223057.173292-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 16 ++++++++-------- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a51e6daacac643..6e38a019f654b5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); +int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,7 +413,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } diff --git a/mm/gup.c b/mm/gup.c index 25e4a3d923d6e9..b0885f70579c5a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab35b1cc99277d..0c1e1ce113c84c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,7 +2925,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3000,7 +3000,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7250,19 +7250,19 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b4b30d9b078272..db85c2d37f70ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f17635..a1e8c3e9ab0808 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dd5ca942256f30..fc034b07064584 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(page_folio(page), qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 206fcdbe67f3d2..f6464bce767803 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } -- cgit 1.2.3-korg From d0ce0e47b323a8d7fb5dc3314ce56afa650ade2d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:33 -0800 Subject: mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio() Change alloc_huge_page() to alloc_hugetlb_folio() by changing all callers to handle the now folio return type of the function. In this conversion, alloc_huge_page_vma() is also changed to alloc_hugetlb_folio_vma() and hugepage_add_new_anon_rmap() is changed to take in a folio directly. Many additions of '&folio->page' are cleaned up in subsequent patches. hugetlbfs_fallocate() is also refactored to use the RCU + page_cache_next_miss() API. Link: https://lkml.kernel.org/r/20230125170537.96973-5-sidhartha.kumar@oracle.com Suggested-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 40 +++++----- include/linux/hugetlb.h | 8 +- include/linux/rmap.h | 2 +- mm/hugetlb.c | 201 ++++++++++++++++++++++++------------------------ mm/mempolicy.c | 6 +- mm/rmap.c | 6 +- 6 files changed, 133 insertions(+), 130 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 44ecdcb796cce3..f89e12106e8a9a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 067906c5778e7b..6408f85e575451 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -717,11 +717,11 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -1033,7 +1033,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -1047,7 +1047,7 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a6bd1f0a183d91..a4570da03e58aa 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd1ce61b8f3f29..ea8d4611779bad 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2493,7 +2493,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; @@ -2507,7 +2507,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* @@ -2798,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the page consumed the reservation. * HPageRestoreReserve is set in the page. * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2814,7 +2814,7 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) @@ -2844,8 +2844,8 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. @@ -3014,7 +3014,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); @@ -3023,7 +3023,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3130,7 +3130,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return &folio->page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add struct folio *new_folio) { __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + hugepage_add_new_anon_rmap(new_folio, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); @@ -5080,34 +5080,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + &new_folio->page); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5478,7 +5478,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5539,9 +5540,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5586,7 +5587,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5599,9 +5600,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); @@ -5618,12 +5619,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5632,9 +5633,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, &new_folio->page); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5753,11 +5754,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5776,9 +5777,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5811,8 +5812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5826,17 +5827,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5845,13 +5846,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, &folio->page); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5864,7 +5865,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5872,8 +5873,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5907,10 +5908,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5923,20 +5924,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5945,11 +5946,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, &folio->page); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -6173,16 +6174,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6193,34 +6194,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6236,25 +6237,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6269,16 +6270,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6290,10 +6291,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6304,7 +6305,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6326,20 +6327,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6871,7 +6872,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc034b07064584..7686f40c97500e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1210,9 +1210,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; diff --git a/mm/rmap.c b/mm/rmap.c index 86fccc2b9fc94d..8287f2cc327d31 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2534,15 +2534,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit 1.2.3-korg From de1f5055523e9a035b38533f25a56df03d45034a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:29 -0800 Subject: mm/mempolicy: convert queue_pages_pmd() to queue_folios_pmd() The function now operates on a folio instead of the page associated with a pmd. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7686f40c97500e..fc754dbcbbcd62 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -442,21 +442,21 @@ static inline bool queue_pages_required(struct page *page, } /* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pmd() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the + * existing folio was already on a node that does not follow the * policy. */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, +static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags; @@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; goto unlock; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pfn_folio(pmd_pfn(*pmd)); + if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) goto unlock; flags = qp->flags; - /* go to thp migration */ + /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { + migrate_page_add(&folio->page, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + return queue_folios_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; -- cgit 1.2.3-korg From 3dae02bbd07f40e37bbfec2d77119628db461eaa Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:30 -0800 Subject: mm/mempolicy: convert queue_pages_pte_range() to queue_folios_pte_range() This function now operates on folios associated with ptes instead of pages. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc754dbcbbcd62..b0805bb87655f5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -491,19 +491,19 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pte_range() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already + * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; bool has_unmovable = false; @@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(page, qp->pagelist, flags)) + if (migrate_page_add(&folio->page, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -704,7 +704,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; -- cgit 1.2.3-korg From 0a2c1e8183163a31fe8c9838f3108aacf9c05c4a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:31 -0800 Subject: mm/mempolicy: convert queue_pages_hugetlb() to queue_folios_hugetlb() This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b0805bb87655f5..66839249350005 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -558,7 +558,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return addr != end ? -EIO : 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) + folio = pfn_folio(pte_pfn(entry)); + if (!queue_pages_required(&folio->page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* - * STRICT alone means only detecting misplaced page and no + * STRICT alone means only detecting misplaced folio and no * need to further check other vma. */ ret = -EIO; @@ -591,21 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which + * Detecting misplaced folio but allow migrating folios which * have been queued. */ ret = 1; goto unlock; } - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + /* + * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it + * is shared it is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. + */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page_folio(page), qp->pagelist) && + if (isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* - * Failed to isolate page but allow migrating pages + * Failed to isolate folio but allow migrating pages * which have been queued. */ ret = 1; @@ -703,7 +710,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, + .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; -- cgit 1.2.3-korg From d451b89dcd183da725eda84dfb8a46c0b32a4234 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:32 -0800 Subject: mm/mempolicy: convert queue_pages_required() to queue_folio_required() Replace queue_pages_required() with queue_folio_required(). queue_folio_required() does the same as queue_pages_required(), except takes in a folio instead of a page. Link: https://lkml.kernel.org/r/20230130201833.27042-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 66839249350005..6a68dbce3b7034 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -427,15 +427,15 @@ struct queue_pages { }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); @@ -469,7 +469,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; flags = qp->flags; @@ -530,7 +530,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, */ if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -575,7 +575,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; folio = pfn_folio(pte_pfn(entry)); - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { -- cgit 1.2.3-korg From 4a64981dfee9119aa2c1f243b48f34cbbd67779c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:33 -0800 Subject: mm/mempolicy: convert migrate_page_add() to migrate_folio_add() Replace migrate_page_add() with migrate_folio_add(). migrate_folio_add() does the same a migrate_page_add() but takes in a folio instead of a page. This removes a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20230130201833.27042-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Cc: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- mm/mempolicy.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6a68dbce3b7034..0919c7a719d419 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); struct queue_pages { @@ -476,7 +476,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(&folio->page, qp->pagelist, flags)) { + migrate_folio_add(folio, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -544,7 +544,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(&folio->page, qp->pagelist, flags)) + if (migrate_folio_add(folio, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -1021,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } else if (flags & MPOL_MF_STRICT) { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ @@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start) } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return -EIO; -- cgit 1.2.3-korg From be2d57563822b7e00b2b16d9354637c4b6d6d5cc Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:34 +0800 Subject: mm: change to return bool for folio_isolate_lru() Patch series "Change the return value for page isolation functions", v3. Now the page isolation functions did not return a boolean to indicate success or not, instead it will return a negative error when failed to isolate a page. So below code used in most places seem a boolean success/failure thing, which can confuse people whether the isolation is successful. if (folio_isolate_lru(folio)) continue; Moreover the page isolation functions only return 0 or -EBUSY, and most users did not care about the negative error except for few users, thus we can convert all page isolation functions to return a boolean value, which can remove the confusion to make code more clear. No functional changes intended in this patch series. This patch (of 4): Now the folio_isolate_lru() did not return a boolean value to indicate isolation success or not, however below code checking the return value can make people think that it was a boolean success/failure thing, which makes people easy to make mistakes (see the fix patch[1]). if (folio_isolate_lru(folio)) continue; Thus it's better to check the negative error value expilictly returned by folio_isolate_lru(), which makes code more clear per Linus's suggestion[2]. Moreover Matthew suggested we can convert the isolation functions to return a boolean[3], since most users did not care about the negative error value, and can also remove the confusing of checking return value. So this patch converts the folio_isolate_lru() to return a boolean value, which means return 'true' to indicate the folio isolation is successful, and 'false' means a failure to isolation. Meanwhile changing all users' logic of checking the isolation state. No functional changes intended. [1] https://lore.kernel.org/all/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com/T/#u [2] https://lore.kernel.org/all/CAHk-=wiBrY+O-4=2mrbVyxR+hOqfdJ=Do6xoucfJ9_5az01L4Q@mail.gmail.com/ [3] https://lore.kernel.org/all/Y+sTFqwMNAjDvxw3@casper.infradead.org/ Link: https://lkml.kernel.org/r/cover.1676424378.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/8a4e3679ed4196168efadf7ea36c038f2f7d5aa9.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 2 +- mm/folio-compat.c | 8 +++++++- mm/gup.c | 2 +- mm/internal.h | 2 +- mm/khugepaged.c | 2 +- mm/madvise.c | 4 ++-- mm/mempolicy.c | 2 +- mm/vmscan.c | 10 +++++----- 8 files changed, 19 insertions(+), 13 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b4df9b9bcc0a1b..607bb69e526cf9 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -246,7 +246,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) folio_clear_referenced(folio); folio_test_clear_young(folio); - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { folio_put(folio); continue; } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 18c48b55792635..540373cf904e99 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -115,9 +115,15 @@ EXPORT_SYMBOL(grab_cache_page_write_begin); int isolate_lru_page(struct page *page) { + bool ret; + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return -EBUSY; - return folio_isolate_lru((struct folio *)page); + ret = folio_isolate_lru((struct folio *)page); + if (ret) + return 0; + + return -EBUSY; } void putback_lru_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index b0885f70579c5a..eab18ba045dbef 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1939,7 +1939,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (folio_isolate_lru(folio)) + if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/internal.h b/mm/internal.h index dfb37e94e140c6..8645e8496537fe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, * in mm/vmscan.c: */ int isolate_lru_page(struct page *page); -int folio_isolate_lru(struct folio *folio); +bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bd54b957f69a7f..15eebab0fbb5b5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1950,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } diff --git a/mm/madvise.c b/mm/madvise.c index 5a5a687d03c24d..c2202f51e9ddf6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -406,7 +406,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else @@ -500,7 +500,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0919c7a719d419..2751bc3310fd67 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1033,7 +1033,7 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, * expensive, so check the estimated mapcount of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), diff --git a/mm/vmscan.c b/mm/vmscan.c index 098c79129c426b..9c1c5e8b24b8f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2337,12 +2337,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * - * Return: 0 if the folio was removed from an LRU list. - * -EBUSY if the folio was not on an LRU list. + * Return: true if the folio was removed from an LRU list. + * false if the folio was not on an LRU list. */ -int folio_isolate_lru(struct folio *folio) +bool folio_isolate_lru(struct folio *folio) { - int ret = -EBUSY; + bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); @@ -2353,7 +2353,7 @@ int folio_isolate_lru(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); - ret = 0; + ret = true; } return ret; -- cgit 1.2.3-korg From 9747b9e92418b61c2281561e0651803f1fad0159 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:36 +0800 Subject: mm: hugetlb: change to return bool for isolate_hugetlb() Now the isolate_hugetlb() only returns 0 or -EBUSY, and most users did not care about the negative value, thus we can convert the isolate_hugetlb() to return a boolean value to make code more clear when checking the hugetlb isolation state. Moreover converts 2 users which will consider the negative value returned by isolate_hugetlb(). No functional changes intended. [akpm@linux-foundation.org: shorten locked section, per SeongJae Park] Link: https://lkml.kernel.org/r/12a287c5bebc13df304387087bbecc6421510849.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 13 ++++++++----- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 +++---- 5 files changed, 16 insertions(+), 14 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index df6dd624ccfe8a..5f5e4177b2e0c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct folio *folio, struct list_head *list); +bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,9 +413,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - return -EBUSY; + return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a01a9dbf445a3..07abcb6eb20304 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,12 +2925,15 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, */ goto free_new; } else if (folio_ref_count(old_folio)) { + bool isolated; + /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_folio, list); + isolated = isolate_hugetlb(old_folio, list); + ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3005,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7251,15 +7254,15 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct folio *folio, struct list_head *list) +bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - int ret = 0; + bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { - ret = -EBUSY; + ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e504362fdb2350..8604753bc64449 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2751bc3310fd67..a256a241fd1dae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -609,7 +609,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(folio, qp->pagelist) && + if (!isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate folio but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 53010a142e7feb..2db546a0618cdd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2095,6 +2095,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; struct page *page; int err; + bool isolated; mmap_read_lock(mm); err = -EFAULT; @@ -2126,13 +2127,11 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page_folio(page), pagelist); - if (!err) - err = 1; + isolated = isolate_hugetlb(page_folio(page), pagelist); + err = isolated ? 1 : -EBUSY; } } else { struct page *head; - bool isolated; head = compound_head(page); isolated = isolate_lru_page(head); -- cgit 1.2.3-korg