From 1eba86c096e35e3cc83de1ad2c26f2d70470211b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:29 -0800 Subject: mm: change page type prior to adding page table entry Patch series "page table check", v3. Ensure that some memory corruptions are prevented by checking at the time of insertion of entries into user page tables that there is no illegal sharing. We have recently found a problem [1] that existed in kernel since 4.14. The problem was caused by broken page ref count and led to memory leaking from one process into another. The problem was accidentally detected by studying a dump of one process and noticing that one page contains memory that should not belong to this process. There are some other page->_refcount related problems that were recently fixed: [2], [3] which potentially could also lead to illegal sharing. In addition to hardening refcount [4] itself, this work is an attempt to prevent this class of memory corruption issues. It uses a simple state machine that is independent from regular MM logic to check for illegal sharing at time pages are inserted and removed from page tables. [1] https://lore.kernel.org/all/xr9335nxwc5y.fsf@gthelen2.svl.corp.google.com [2] https://lore.kernel.org/all/1582661774-30925-2-git-send-email-akaher@vmware.com [3] https://lore.kernel.org/all/20210622021423.154662-3-mike.kravetz@oracle.com [4] https://lore.kernel.org/all/20211221150140.988298-1-pasha.tatashin@soleen.com This patch (of 4): There are a few places where we first update the entry in the user page table, and later change the struct page to indicate that this is anonymous or file page. In most places, however, we first configure the page metadata and then insert entries into the page table. Page table check, will use the information from struct page to verify the type of entry is inserted. Change the order in all places to first update struct page, and later to update page table. This means that we first do calls that may change the type of page (anon or file): page_move_anon_rmap page_add_anon_rmap do_page_add_anon_rmap page_add_new_anon_rmap page_add_file_rmap hugepage_add_anon_rmap hugepage_add_new_anon_rmap And after that do calls that add entries to the page table: set_huge_pte_at set_pte_at Link: https://lkml.kernel.org/r/20211221154650.1047963-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20211221154650.1047963-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: David Rientjes Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Will Deacon Cc: Mike Rapoport Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Masahiro Yamada Cc: Sami Tolvanen Cc: Dave Hansen Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e15f..e64207e2ef1dd4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1917,14 +1917,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); -- cgit 1.2.3-korg From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:44 -0800 Subject: mm: remove last argument of reuse_swap_page() None of the callers care about the total_map_swapcount() any more. Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: William Kucharski Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swapfile.c | 8 +------- 5 files changed, 7 insertions(+), 13 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19f5..bdccbf1efa6198 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page, NULL) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5483347291c07..b61fbe95c856c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9d40dd8890e581..698ea19775ac31 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -681,7 +681,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. diff --git a/mm/memory.c b/mm/memory.c index 5fea331b15607d..571d02f419baa4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3627,7 +3627,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/swapfile.c b/mm/swapfile.c index e64207e2ef1dd4..31d13a393cf0f2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1668,12 +1668,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { int count, total_mapcount, total_swapcount; @@ -1682,8 +1678,6 @@ bool reuse_swap_page(struct page *page, int *total_map_swapcount) return false; count = page_trans_huge_map_swapcount(page, &total_mapcount, &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit 1.2.3-korg From 66c7f7a6ac6624fc7e226d43913e10f1f047f579 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:48 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_map_swapcount() Now that we don't report it to the caller of reuse_swap_page(), we don't need to request it from page_trans_huge_map_swapcount(). Link: https://lkml.kernel.org/r/20211220205943.456187-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 31d13a393cf0f2..a93f5b5fc8b687 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page, NULL); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1671,13 +1664,12 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, */ bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit 1.2.3-korg From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:51 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_mapcount() All callers pass NULL, so we can stop calculating the value we would store in it. Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++------- include/linux/swap.h | 2 +- mm/huge_memory.c | 30 ++++++++++-------------------- mm/swapfile.c | 2 +- 4 files changed, 15 insertions(+), 29 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7245e6802a65..cef65f9cbdf253 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index bdccbf1efa6198..1d38d9475c4d08 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry) } #define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page, NULL) == 1) + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b61fbe95c856c7..6ed86a8f6a5bec 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ diff --git a/mm/swapfile.c b/mm/swapfile.c index a93f5b5fc8b687..caa9f81a0d15fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1619,7 +1619,7 @@ static int page_trans_huge_map_swapcount(struct page *page, swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page, NULL); + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); -- cgit 1.2.3-korg