Diffstat (limited to 'mm')
-rw-r--r--  mm/frame_vector.c    2
-rw-r--r--  mm/gup.c             4
-rw-r--r--  mm/internal.h        2
-rw-r--r--  mm/madvise.c       268
-rw-r--r--  mm/memcontrol.c     10
-rw-r--r--  mm/mempolicy.c       3
-rw-r--r--  mm/migrate.c         2
-rw-r--r--  mm/mincore.c         2
-rw-r--r--  mm/mlock.c           4
-rw-r--r--  mm/mmap.c           34
-rw-r--r--  mm/mprotect.c        2
-rw-r--r--  mm/mremap.c          3
-rw-r--r--  mm/msync.c           2
-rw-r--r--  mm/oom_kill.c        2
-rw-r--r--  mm/swap.c           42
-rw-r--r--  mm/vmalloc.c         5
-rw-r--r--  mm/vmscan.c         62
17 files changed, 427 insertions, 22 deletions
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c64dca6e27c28c..c431ca81dad551 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
 	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
 		nr_frames = vec->nr_allocated;
 
+	start = untagged_addr(start);
+
 	down_read(&mm->mmap_sem);
 	locked = 1;
 	vma = find_vma_intersection(mm, start, start + 1);
diff --git a/mm/gup.c b/mm/gup.c
index 60c3915c8ee6c6..23a9f9c9d37754 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -788,6 +788,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (!nr_pages)
 		return 0;
 
+	start = untagged_addr(start);
+
 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 
 	/*
@@ -950,6 +952,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 	struct vm_area_struct *vma;
 	vm_fault_t ret, major = 0;
 
+	address = untagged_addr(address);
+
 	if (unlocked)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
diff --git a/mm/internal.h b/mm/internal.h
index e32390802fd3f1..0d5f720c75abf0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 68ab988ad4333b..2be9f3fdb05e7e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -31,6 +32,11 @@
 
 #include "internal.h"
 
+struct madvise_walk_private {
+	struct mmu_gather *tlb;
+	bool pageout;
+};
+
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
  * take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -42,6 +48,8 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -289,6 +297,254 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct madvise_walk_private *private = walk->private;
+	struct mmu_gather *tlb = private->tlb;
+	bool pageout = private->pageout;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page = NULL;
+	LIST_HEAD(page_list);
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page))
+				list_add(&page->lru, &page_list);
+		} else
+			deactivate_page(page);
+huge_unlock:
+		spin_unlock(ptl);
+		if (pageout)
+			reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (pte_none(ptent))
+			continue;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+
+		/*
+		 * We are deactivating a page for accelerating reclaiming.
+		 * VM couldn't reclaim the page unless we clear PG_young.
+		 * As a side effect, it makes confuse idle-page tracking
+		 * because they will miss recent referenced history.
+		 */
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page))
+				list_add(&page->lru, &page_list);
+		} else
+			deactivate_page(page);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	if (pageout)
+		reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+	.pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = false,
+		.tlb = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = true,
+		.tlb = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
@@ -493,7 +749,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  int behavior)
 {
 	*prev = vma;
-	if (!can_madv_dontneed_vma(vma))
+	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
@@ -515,7 +771,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*
@@ -669,6 +925,10 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_remove(vma, prev, start, end);
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
+	case MADV_COLD:
+		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -690,6 +950,8 @@ madvise_behavior_valid(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
@@ -784,6 +1046,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	size_t len;
 	struct blk_plug plug;
 
+	start = untagged_addr(start);
+
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2156ef775d04be..c313c49074cad4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2943,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
 	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+		/*
+		 * Enforce __GFP_NOFAIL allocation because callers are not
+		 * prepared to see failures and likely do not have any failure
+		 * handling code.
+		 */
+		if (gfp & __GFP_NOFAIL) {
+			page_counter_charge(&memcg->kmem, nr_pages);
+			return 0;
+		}
 		cancel_charge(memcg, nr_pages);
 		return -ENOMEM;
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 464406e8da9109..de27d08b1ff8d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1405,6 +1405,7 @@ static long kernel_mbind(unsigned long start, unsigned long len,
 	int err;
 	unsigned short mode_flags;
 
+	start = untagged_addr(start);
 	mode_flags = mode & MPOL_MODE_FLAGS;
 	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
@@ -1558,6 +1559,8 @@ static int kernel_get_mempolicy(int __user *policy,
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
+	addr = untagged_addr(addr);
+
 	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 73d476d690b105..4fe45d1428c867 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1612,7 +1612,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 			goto out_flush;
 		if (get_user(node, nodes + i))
 			goto out_flush;
-		addr = (unsigned long)p;
+		addr = (unsigned long)untagged_addr(p);
 
 		err = -ENODEV;
 		if (node < 0 || node >= MAX_NUMNODES)
diff --git a/mm/mincore.c b/mm/mincore.c
index f9a9dbe8cd3301..49b6fa2f6aa1d6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -256,6 +256,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	unsigned long pages;
 	unsigned char *tmp;
 
+	start = untagged_addr(start);
+
 	/* Check the start address: needs to be page-aligned.. */
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index a90099da4fb416..a72c1eeded7729 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -674,6 +674,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
 	unsigned long lock_limit;
 	int error = -ENOMEM;
 
+	start = untagged_addr(start);
+
 	if (!can_do_mlock())
 		return -EPERM;
 
@@ -735,6 +737,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
 {
 	int ret;
 
+	start = untagged_addr(start);
+
 	len = PAGE_ALIGN(len + (offset_in_page(start)));
 	start &= PAGE_MASK;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f1e8c7f93e04c6..a7d8c84d19b712 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -201,6 +201,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	bool downgraded = false;
 	LIST_HEAD(uf);
 
+	brk = untagged_addr(brk);
+
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
@@ -289,9 +291,9 @@ out:
 	return retval;
 }
 
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
 {
-	unsigned long max, prev_end, subtree_gap;
+	unsigned long gap, prev_end;
 
 	/*
 	 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -299,14 +301,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 	 * an unmapped area; whereas when expanding we only require one.
 	 * That's a little inconsistent, but keeps the code here simpler.
 	 */
-	max = vm_start_gap(vma);
+	gap = vm_start_gap(vma);
 	if (vma->vm_prev) {
 		prev_end = vm_end_gap(vma->vm_prev);
-		if (max > prev_end)
-			max -= prev_end;
+		if (gap > prev_end)
+			gap -= prev_end;
 		else
-			max = 0;
+			gap = 0;
 	}
+	return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+	unsigned long max = vma_compute_gap(vma), subtree_gap;
 	if (vma->vm_rb.rb_left) {
 		subtree_gap = rb_entry(vma->vm_rb.rb_left,
 				struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -322,7 +331,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 	return max;
 }
 
-#ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct mm_struct *mm)
 {
 	struct rb_root *root = &mm->mm_rb;
@@ -428,8 +436,9 @@ static void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
-		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+			 struct vm_area_struct, vm_rb,
+			 unsigned long, rb_subtree_gap, vma_compute_gap)
 
 /*
  * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -439,8 +448,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
 static void vma_gap_update(struct vm_area_struct *vma)
 {
 	/*
-	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
-	 * function that does exactly what we want.
+	 * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+	 * a callback function that does exactly what we want.
 	 */
 	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
 }
@@ -1580,6 +1589,8 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 	struct file *file = NULL;
 	unsigned long retval;
 
+	addr = untagged_addr(addr);
+
 	if (!(flags & MAP_ANONYMOUS)) {
 		audit_mmap_fd(fd, flags);
 		file = fget(fd);
@@ -2878,6 +2889,7 @@ EXPORT_SYMBOL(vm_munmap);
 
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
+	addr = untagged_addr(addr);
 	profile_munmap(addr);
 	return __vm_munmap(addr, len, true);
 }
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 675e5d34a50773..7967825f6d3352 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -459,6 +459,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
 				(prot & PROT_READ);
 
+	start = untagged_addr(start);
+
 	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
 	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
 		return -EINVAL;
diff --git a/mm/mremap.c b/mm/mremap.c
index fc241d23cd97ab..1fc8a29fbe3f44 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -606,6 +606,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
 
+	addr = untagged_addr(addr);
+	new_addr = untagged_addr(new_addr);
+
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		return ret;
 
diff --git a/mm/msync.c b/mm/msync.c
index ef30a429623a2f..c3bd3e75f68764 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	int unmapped_error = 0;
 	int error = -EINVAL;
 
+	start = untagged_addr(start);
+
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
 	if (offset_in_page(start))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c1d9496b4c4326..71e3acea781764 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
diff --git a/mm/swap.c b/mm/swap.c
index 784dc162062004..38c3fa4308e210 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			    void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
 
 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
 	}
 }
 
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		get_page(page);
+		if (!pagevec_add(pvec, page) || PageCompound(page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 /**
  * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fcadd3e25c0c8e..a3c70e275f4e53 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -396,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va)
 		get_subtree_max_size(va->rb_node.rb_right));
 }
 
-RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
-	struct vmap_area, rb_node, unsigned long, subtree_max_size,
-	compute_subtree_max_size)
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
 
 static void purge_vmap_area_lazy(void);
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4911754c93b7ed..e5d52d6a24aff1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1123,7 +1123,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      struct reclaim_stat *stat,
-				      bool force_reclaim)
+				      bool ignore_references)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -1137,7 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
-		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		enum page_references references = PAGEREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
 
@@ -1268,7 +1268,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 		}
 
-		if (!force_reclaim)
+		if (!ignore_references)
 			references = page_check_references(page, sc);
 
 		switch (references) {
@@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
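Note: the two new advice values introduced by the mm/madvise.c changes above are consumed through the existing madvise(2) system call. The sketch below is a hypothetical userspace usage example, not part of the commit; the fallback #defines assume the uapi values added by this series (20 and 21) in case the installed libc headers do not yet provide them.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLD
#define MADV_COLD 20		/* assumed uapi value: deactivate these pages */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* assumed uapi value: reclaim these pages */
#endif

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0xaa, len);		/* fault the pages in and make them "hot" */

	/* Hint that the range is cold: pages move to the inactive LRU. */
	if (madvise(buf, len, MADV_COLD))
		perror("madvise(MADV_COLD)");

	/* Stronger hint: reclaim the range now (swap out / drop clean pages). */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	munmap(buf, len);
	return 0;
}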
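The untagged_addr() calls added at the syscall entry points above strip an architecture-defined pointer tag before the address is used for range checks and VMA lookups. As a rough illustration only (the real macro lives in the arch headers and is not part of this diff), on arm64 with Top Byte Ignore the operation behaves like a sign extension from bit 55, which clears a non-zero tag byte on user addresses while leaving untagged addresses unchanged:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative stand-in for untagged_addr() on arm64 with TBI:
 * sign-extend from bit 55 so bits 63:56 (the tag byte) are replaced
 * by copies of bit 55. This is a sketch for explanation, not the
 * kernel's actual implementation.
 */
static inline uint64_t untagged_addr_sketch(uint64_t addr)
{
	return (uint64_t)(((int64_t)(addr << 8)) >> 8);
}

int main(void)
{
	uint64_t tagged = 0x5a007fffdeadbeefULL;	/* user pointer carrying tag 0x5a */

	printf("tagged:   %#018llx\n", (unsigned long long)tagged);
	printf("untagged: %#018llx\n",
	       (unsigned long long)untagged_addr_sketch(tagged));
	return 0;
}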
