From e709accc7670a07b2e5186449f0640c2416662ec Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 24 Jul 2019 08:52:58 +0200 Subject: mm/hmm: comment on VM_FAULT_RETRY semantics in handle_mm_fault The magic dropping of mmap_sem when handle_mm_fault returns VM_FAULT_RETRY is rather subtle. Add a comment explaining it. Link: https://lore.kernel.org/r/20190724065258.16603-8-hch@lst.de Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe [hch: wrote a changelog] Signed-off-by: Christoph Hellwig --- mm/hmm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 16b6731a34db79..54b3a4162ae949 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -301,8 +301,10 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); - if (ret & VM_FAULT_RETRY) + if (ret & VM_FAULT_RETRY) { + /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ return -EAGAIN; + } if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; -- cgit 1.2.3-korg From 1f961807925032daa90267d8a23ff730e7ede07a Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:44 -0700 Subject: mm/hmm: replace hmm_update with mmu_notifier_range The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary, so replace hmm_update with mmu_notifier_range directly. Link: https://lore.kernel.org/r/20190726005650.2566-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe [jgg: white space tuning] Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 14 ++++++++------ drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h | 34 ++++++---------------------------- mm/hmm.c | 13 ++++--------- 4 files changed, 20 insertions(+), 45 deletions(-) (limited to 'mm/hmm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3971c201f3205b..b698b423b25d31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -195,13 +195,14 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. */ -static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ @@ -243,13 +244,14 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa.
*/ -static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 545100f7c59488..79b29c918717cf 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 9f32586684c9c3..4f870f7017cb52 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -383,9 +360,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update information (see struct hmm_update) - * Return: -EAGAIN if update.blockable false and callback need to - * block, 0 otherwise. + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false + * and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -396,8 +373,9 @@ struct hmm_mirror_ops { * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. 
*/ - int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + int (*sync_cpu_device_pagetables)( + struct hmm_mirror *mirror, + const struct mmu_notifier_range *update); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 54b3a4162ae949..4040b44276353b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -165,7 +165,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_update update; struct hmm_range *range; unsigned long flags; int ret = 0; @@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, if (!kref_get_unless_zero(&hmm->kref)) return 0; - update.start = nrange->start; - update.end = nrange->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = mmu_notifier_range_blockable(nrange); - spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { - if (update.end < range->start || update.start >= range->end) + if (nrange->end < range->start || nrange->start >= range->end) continue; range->valid = false; @@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, list_for_each_entry(mirror, &hmm->mirrors, list) { int rc; - rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); if (rc) { - if (WARN_ON(update.blockable || rc != -EAGAIN)) + if (WARN_ON(mmu_notifier_range_blockable(nrange) || + rc != -EAGAIN)) continue; ret = -EAGAIN; break; -- cgit 1.2.3-korg From d2e8d551165ccb6669a0d0273be01ac99d61deba Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:45 -0700 Subject: mm/hmm: a few more C style and comment clean ups A few more comments and minor programming style clean ups. There should be no functional changes. Link: https://lore.kernel.org/r/20190726005650.2566-3-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 4040b44276353b..362944b0fbca57 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -32,7 +32,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to - * Returns: returns an HMM object, either by referencing the existing + * Return: an HMM object, either by referencing the existing * (per-process) object, or by creating a new one. * * This is not intended to be used directly by device drivers. If mm already @@ -325,8 +325,8 @@ static int hmm_pfns_bad(unsigned long addr, } /* - * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) - * @start: range virtual start address (inclusive) + * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) * @fault: should we fault or not ? * @write_fault: write fault ? @@ -376,9 +376,9 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can - * be use in 2 fashions. 
The first one where the HMM user coalesce - * multiple page fault into one request and set flags per pfns for - * of those faults. The second one where the HMM user want to pre- + * be used 2 ways. The first one where the HMM user coalesces + * multiple page faults into one request and sets flags per pfn for + * those faults. The second one where the HMM user wants to pre- * fault a range with specific flags. For the latter one it is a * waste to have the user pre-fill the pfn arrays with a default * flags value. @@ -388,7 +388,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; - /* If this is device memory than only fault if explicitly requested */ + /* If this is device memory then only fault if explicitly requested */ if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { /* Do we fault on device memory ? */ if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { @@ -502,7 +502,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, hmm_vma_walk->last = end; return 0; #else - /* If THP is not enabled then we should never reach that code ! */ + /* If THP is not enabled then we should never reach this code ! */ return -EINVAL; #endif } @@ -522,7 +522,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; bool fault, write_fault; uint64_t cpu_flags; pte_t pte = *ptep; @@ -571,8 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); + migration_entry_wait(walk->mm, pmdp, addr); return -EBUSY; } return 0; @@ -620,13 +618,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; pmd_t pmd; - again: pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) @@ -648,7 +644,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, 0, &fault, &write_fault); if (fault || write_fault) { hmm_vma_walk->last = addr; - pmd_migration_entry_wait(vma->vm_mm, pmdp); + pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } return 0; @@ -657,11 +653,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* - * No need to take pmd_lock here, even if some other threads + * No need to take pmd_lock here, even if some other thread * is splitting the huge pmd we will get that event through * mmu_notifier callback. * - * So just read pmd value and check again its a transparent + * So just read pmd value and check again it's a transparent * huge or device mapping one and compute corresponding pfn * values. */ @@ -675,7 +671,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, } /* - * We have handled all the valid case above ie either none, migration, + * We have handled all the valid cases above ie either none, migration, * huge or transparent huge. At this point either it is a valid pmd * entry pointing to pte directory or it is a bad pmd that will not * recover. 
@@ -795,10 +791,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; int ret = 0; - size = 1UL << huge_page_shift(h); + size = huge_page_size(h); mask = size - 1; if (range->page_shift != PAGE_SHIFT) { - /* Make sure we are looking at full page. */ + /* Make sure we are looking at a full page. */ if (start & mask) return -EINVAL; if (end < (start + size)) @@ -809,8 +805,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, size = PAGE_SIZE; } - - ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); i = (start - range->start) >> range->page_shift; @@ -859,7 +854,7 @@ static void hmm_pfns_clear(struct hmm_range *range, * @start: start virtual address (inclusive) * @end: end virtual address (exclusive) * @page_shift: expect page shift for the range - * Returns 0 on success, -EFAULT if the address space is no longer valid + * Return: 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h */ -- cgit 1.2.3-korg From 9a4903e49e495bfd2650862dfae4178bebe4db9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:46 -0700 Subject: mm/hmm: replace the block argument to hmm_range_fault with a flags value This allows easier expansion to other flags, and also makes the callers a little easier to read. Link: https://lore.kernel.org/r/20190726005650.2566-4-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 11 ++++- mm/hmm.c | 74 ++++++++++++++++----------------- 4 files changed, 48 insertions(+), 41 deletions(-) (limited to 'mm/hmm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e51b48ac48eb15..12a59ac83f7271 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -832,7 +832,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) down_read(&mm->mmap_sem); - r = hmm_range_fault(range, true); + r = hmm_range_fault(range, 0); if (unlikely(r < 0)) { if (likely(r == -EAGAIN)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 79b29c918717cf..49b520c60fc5a6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -505,7 +505,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) return -EBUSY; } - ret = hmm_range_fault(range, true); + ret = hmm_range_fault(range, 0); if (ret <= 0) { if (ret == 0) ret = -EBUSY; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4f870f7017cb52..89a141060e5bad 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -407,12 +407,19 @@ int hmm_range_register(struct hmm_range *range, unsigned long end, unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); + +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. 
+ */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + long hmm_range_snapshot(struct hmm_range *range); -long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_dma_map(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, - bool block); + unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, struct vm_area_struct *vma, struct device *device, diff --git a/mm/hmm.c b/mm/hmm.c index 362944b0fbca57..84f2791d351021 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -281,7 +281,7 @@ struct hmm_vma_walk { struct dev_pagemap *pgmap; unsigned long last; bool fault; - bool block; + unsigned int flags; }; static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, @@ -293,8 +293,11 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= write_fault ? FAULT_FLAG_WRITE : 0; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) + flags |= FAULT_FLAG_ALLOW_RETRY; + if (write_fault) + flags |= FAULT_FLAG_WRITE; + ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) { /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ @@ -1012,26 +1015,26 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: +/** + * hmm_range_fault - try to fault some address in a virtual address range + * @range: range being faulted + * @flags: HMM_FAULT_* flags * - * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (for instance device file vma). - * -ENOMEM: Out of memory. - * -EPERM: Invalid permission (for instance asking for write and - * range is read only). - * -EAGAIN: If you need to retry and mmap_sem was drop. This can only - * happens if block argument is false. - * -EBUSY: If the the range is being invalidated and you should wait - * for invalidation to finish. - * -EFAULT: Invalid (ie either no valid vma or it is illegal to access - * that range), number of valid pages in range->pfns[] (from - * range start address). + * Return: the number of valid pages in range->pfns[] (from range start + * address), which may be zero. On error one of the following status codes + * can be returned: + * + * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma + * (e.g., device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (e.g., asking for write and range is read + * only). + * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. + * -EBUSY: The range has been invalidated and the caller needs to wait for + * the invalidation to finish. + * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access + * that range) number of valid pages in range->pfns[] (from + * range start address). 
* * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs @@ -1040,7 +1043,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. */ -long hmm_range_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, unsigned int flags) { const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; @@ -1086,7 +1089,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; + hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; end = min(range->end, vma->vm_end); @@ -1125,25 +1128,22 @@ long hmm_range_fault(struct hmm_range *range, bool block) EXPORT_SYMBOL(hmm_range_fault); /** - * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device against to dma map page to - * @daddrs: dma address of mapped pages - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been - * drop and you need to try again, some other error value otherwise + * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device to map page to + * @daddrs: array of dma addresses for the mapped pages + * @flags: HMM_FAULT_* * - * Note same usage pattern as hmm_range_fault(). + * Return: the number of pages mapped on success (including zero), or any + * status return from hmm_range_fault() otherwise. */ -long hmm_range_dma_map(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool block) +long hmm_range_dma_map(struct hmm_range *range, struct device *device, + dma_addr_t *daddrs, unsigned int flags) { unsigned long i, npages, mapped; long ret; - ret = hmm_range_fault(range, block); + ret = hmm_range_fault(range, flags); if (ret <= 0) return ret ? ret : -EBUSY; -- cgit 1.2.3-korg From d45d464b118f428229d91769c8a3cc1e2e0bb4d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:47 -0700 Subject: mm/hmm: merge hmm_range_snapshot into hmm_range_fault Add a HMM_FAULT_SNAPSHOT flag so that hmm_range_snapshot can be merged into the almost identical hmm_range_fault function. Link: https://lore.kernel.org/r/20190726005650.2566-5-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 17 +++++----- include/linux/hmm.h | 4 ++- mm/hmm.c | 85 ++---------------------------------------------- 3 files changed, 13 insertions(+), 93 deletions(-) (limited to 'mm/hmm.c') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 710ce1c701bf3a..ddcb5ca8b296d6 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. 
When the device driver wants to populate a range of virtual addresses, it can -use either:: - - long hmm_range_snapshot(struct hmm_range *range); - long hmm_range_fault(struct hmm_range *range, bool block); +use:: + + long hmm_range_fault(struct hmm_range *range, unsigned int flags); -The first one (hmm_range_snapshot()) will only fetch present CPU page table +With the HMM_FAULT_SNAPSHOT flag, it will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entries if -write access is requested (see below). Page faults use the generic mm page +Without that flag, it does trigger a page fault on missing or read-only entries +if write access is requested (see below). Page faults use the generic mm page fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each @@ -227,20 +226,20 @@ The usage pattern is:: /* * Just wait for range to be valid, safe to ignore return value as we - * will use the return value of hmm_range_snapshot() below under the + * will use the return value of hmm_range_fault() below under the * mmap_sem to ascertain the validity of the range. */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); again: down_read(&mm->mmap_sem); - ret = hmm_range_snapshot(&range); + ret = hmm_range_fault(&range, HMM_FAULT_SNAPSHOT); if (ret) { up_read(&mm->mmap_sem); if (ret == -EBUSY) { /* * No need to check hmm_range_wait_until_valid() return value - * on retry we will get proper error with hmm_range_snapshot() + * on retry we will get proper error with hmm_range_fault() */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 89a141060e5bad..90dc5944b1bce6 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -413,7 +413,9 @@ void hmm_range_unregister(struct hmm_range *range); */ #define HMM_FAULT_ALLOW_RETRY (1 << 0) -long hmm_range_snapshot(struct hmm_range *range); +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + long hmm_range_fault(struct hmm_range *range, unsigned int flags); long hmm_range_dma_map(struct hmm_range *range, diff --git a/mm/hmm.c b/mm/hmm.c index 84f2791d351021..1bc014cddd78af 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -280,7 +280,6 @@ struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; unsigned long last; - bool fault; unsigned int flags; }; @@ -373,7 +372,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - if (!hmm_vma_walk->fault) + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) return; /* @@ -418,7 +417,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { unsigned long i; - if (!hmm_vma_walk->fault) { + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { *fault = *write_fault = false; return; } @@ -936,85 +935,6 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); -/* - * hmm_range_snapshot() - snapshot CPU page table for a range - * @range: range - * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * permission (for instance asking for write and range is read only), - * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid - * vma or it is illegal to access that range), number of valid pages - * in range->pfns[] (from range start address).
- * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See in include/linux/hmm.h for example - * on how to use. - */ -long hmm_range_snapshot(struct hmm_range *range) -{ - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; - struct mm_walk mm_walk; - - lockdep_assert_held(&hmm->mm->mmap_sem); - do { - /* If range is no longer valid force retry. */ - if (!range->valid) - return -EBUSY; - - vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - range->vma = vma; - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - end = min(range->end, vma->vm_end); - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pud_entry = hmm_vma_walk_pud; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; - - walk_page_range(start, end, &mm_walk); - start = end; - } while (start < range->end); - - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; -} -EXPORT_SYMBOL(hmm_range_snapshot); - /** * hmm_range_fault - try to fault some address in a virtual address range * @range: range being faulted @@ -1088,7 +1008,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; - hmm_vma_walk.fault = true; hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; -- cgit 1.2.3-korg From f527688d5d8a80d2d1b2c02779105747c2f4f705 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:49 -0700 Subject: mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd walk_page_range() will only call hmm_vma_walk_hugetlb_entry() for hugetlbfs pages and doesn't call hmm_vma_walk_pmd() in this case. Therefore, it is safe to remove the check for vma->vm_flags & VM_HUGETLB in hmm_vma_walk_pmd(). 
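As a sketch of why this holds (this mirrors the mm_walk wiring that mm/hmm.c already sets up for its page walks; not a complete walker configuration):

    /*
     * walk_page_range() sends a hugetlbfs VMA to ->hugetlb_entry and
     * never descends into ->pmd_entry for it, so hmm_vma_walk_pmd()
     * cannot see a hugetlbfs PMD.
     */
    mm_walk.pmd_entry = hmm_vma_walk_pmd;
    mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
    walk_page_range(start, end, &mm_walk);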
Link: https://lore.kernel.org/r/20190726005650.2566-7-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 1bc014cddd78af..6111c0a3c12d73 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -630,9 +630,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) - return hmm_pfns_bad(start, end, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { bool fault, write_fault; unsigned long npages; -- cgit 1.2.3-korg From cc374377a19d2a49d693997b62dc3a6f5fac6d61 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:50 -0700 Subject: mm/hmm: remove hmm_range vma Since hmm_range_fault() doesn't use the struct hmm_range vma field, remove it. Link: https://lore.kernel.org/r/20190726005650.2566-8-rcampbell@nvidia.com Suggested-by: Jason Gunthorpe Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++---- include/linux/hmm.h | 1 - mm/hmm.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'mm/hmm.c') diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 49b520c60fc5a6..a74530b5a5236e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -496,12 +496,12 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) range->start, range->end, PAGE_SHIFT); if (ret) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return (int)ret; } if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return -EBUSY; } @@ -509,7 +509,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) if (ret <= 0) { if (ret == 0) ret = -EBUSY; - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); hmm_range_unregister(range); return ret; } @@ -682,7 +682,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. 
*/ - range.vma = vma; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 90dc5944b1bce6..82265118d94abd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -164,7 +164,6 @@ enum hmm_pfn_value_e { */ struct hmm_range { struct hmm *hmm; - struct vm_area_struct *vma; struct list_head list; unsigned long start; unsigned long end; diff --git a/mm/hmm.c b/mm/hmm.c index 6111c0a3c12d73..9a908902e4cc38 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1002,7 +1002,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) return -EPERM; } - range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.flags = flags; -- cgit 1.2.3-korg From 2cbeb41913e639890bf2c6485d7bdc6a25fdfd56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:43 +0300 Subject: mm/hmm: remove the unused vma argument to hmm_range_dma_unmap Link: https://lore.kernel.org/r/20190806160554.14046-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 1 - mm/hmm.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'mm/hmm.c') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 82265118d94abd..59be0aa2476da6 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -422,7 +422,6 @@ long hmm_range_dma_map(struct hmm_range *range, dma_addr_t *daddrs, unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, - struct vm_area_struct *vma, struct device *device, dma_addr_t *daddrs, bool dirty); diff --git a/mm/hmm.c b/mm/hmm.c index 9a908902e4cc38..160ecb47d04bc6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1129,7 +1129,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); /** * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() * @range: range being unmapped - * @vma: the vma against which the range (optional) * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set @@ -1141,7 +1140,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. */ long hmm_range_dma_unmap(struct hmm_range *range, - struct vm_area_struct *vma, struct device *device, dma_addr_t *daddrs, bool dirty) -- cgit 1.2.3-korg From fac555ac93d453a0d2265eef88bf4c249dd63e07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:44 +0300 Subject: mm/hmm: remove superfluous arguments from hmm_range_register The start, end and page_shift values are all saved in the range structure, so we might as well use that for argument passing. 
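A caller-side sketch of the resulting convention (this mirrors the amdgpu and nouveau conversions below; start and end are illustrative locals):

    /* Before: geometry passed as separate arguments. */
    ret = hmm_range_register(range, mirror, start, end, PAGE_SHIFT);

    /* After: fill in the hmm_range first, then register it. */
    range->page_shift = PAGE_SHIFT;
    range->start = start;
    range->end = end;
    ret = hmm_range_register(range, mirror);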
Link: https://lore.kernel.org/r/20190806160554.14046-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Felix Kuehling Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 +++++-- drivers/gpu/drm/nouveau/nouveau_svm.c | 5 ++--- include/linux/hmm.h | 6 +----- mm/hmm.c | 20 +++++--------------- 5 files changed, 14 insertions(+), 26 deletions(-) (limited to 'mm/hmm.c') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index ddcb5ca8b296d6..e63c11f7e0e0d6 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -222,7 +222,7 @@ The usage pattern is:: range.flags = ...; range.values = ...; range.pfn_shift = ...; - hmm_range_register(&range); + hmm_range_register(&range, mirror); /* * Just wait for range to be valid, safe to ignore return value as we diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index f0821638bbc63f..71d6e7087b0b51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -818,8 +818,11 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) 0 : range->flags[HMM_PFN_WRITE]; range->pfn_flags_mask = 0; range->pfns = pfns; - hmm_range_register(range, mirror, start, - start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT); + range->page_shift = PAGE_SHIFT; + range->start = start; + range->end = start + ttm->num_pages * PAGE_SIZE; + + hmm_range_register(range, mirror); /* * Just wait for range to be valid, safe to ignore return value as we diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 98072fd48cf722..41fad4719ac62f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -492,9 +492,7 @@ nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, &svmm->mirror, - range->start, range->end, - PAGE_SHIFT); + ret = hmm_range_register(range, &svmm->mirror); if (ret) { up_read(&svmm->mm->mmap_sem); return (int)ret; @@ -682,6 +680,7 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ + range.page_shift = PAGE_SHIFT; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 59be0aa2476da6..c5b51376b4539f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -400,11 +400,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * Please see Documentation/vm/hmm.rst for how to use the range API. 
*/ -int hmm_range_register(struct hmm_range *range, - struct hmm_mirror *mirror, - unsigned long start, - unsigned long end, - unsigned page_shift); +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); void hmm_range_unregister(struct hmm_range *range); /* diff --git a/mm/hmm.c b/mm/hmm.c index 160ecb47d04bc6..ab22fdb655c8e3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -850,35 +850,25 @@ static void hmm_pfns_clear(struct hmm_range *range, * hmm_range_register() - start tracking change to CPU page table over a range * @range: range * @mm: the mm struct for the range of virtual address - * @start: start virtual address (inclusive) - * @end: end virtual address (exclusive) - * @page_shift: expect page shift for the range + * * Return: 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h */ -int hmm_range_register(struct hmm_range *range, - struct hmm_mirror *mirror, - unsigned long start, - unsigned long end, - unsigned page_shift) +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) { - unsigned long mask = ((1UL << page_shift) - 1UL); + unsigned long mask = ((1UL << range->page_shift) - 1UL); struct hmm *hmm = mirror->hmm; unsigned long flags; range->valid = false; range->hmm = NULL; - if ((start & mask) || (end & mask)) + if ((range->start & mask) || (range->end & mask)) return -EINVAL; - if (start >= end) + if (range->start >= range->end) return -EINVAL; - range->page_shift = page_shift; - range->start = start; - range->end = end; - /* Prevent hmm_release() from running while the range is valid */ if (!mmget_not_zero(hmm->mm)) return -EFAULT; -- cgit 1.2.3-korg From 7f08263d9bc6627382da14f9e81d643d0329d5d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:45 +0300 Subject: mm/hmm: remove the page_shift member from struct hmm_range All users pass PAGE_SIZE here, and if we wanted to support single entries for huge pages we should really just add a HMM_FAULT_HUGEPAGE flag that uses the huge page size, instead of having the caller calculate that size once just for the hmm code to verify it. Link: https://lore.kernel.org/r/20190806160554.14046-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 - drivers/gpu/drm/nouveau/nouveau_svm.c | 1 - include/linux/hmm.h | 22 ----------------- mm/hmm.c | 42 +++++++-------------------------- 4 files changed, 9 insertions(+), 57 deletions(-) (limited to 'mm/hmm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 71d6e7087b0b51..8bf79288c4e29b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -818,7 +818,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) 0 : range->flags[HMM_PFN_WRITE]; range->pfn_flags_mask = 0; range->pfns = pfns; - range->page_shift = PAGE_SHIFT; range->start = start; range->end = start + ttm->num_pages * PAGE_SIZE; diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 41fad4719ac62f..668d4bd0c118f1 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -680,7 +680,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU.
*/ - range.page_shift = PAGE_SHIFT; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index c5b51376b4539f..51e18fbb895345 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -158,7 +158,6 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -172,31 +171,10 @@ struct hmm_range { const uint64_t *values; uint64_t default_flags; uint64_t pfn_flags_mask; - uint8_t page_shift; uint8_t pfn_shift; bool valid; }; -/* - * hmm_range_page_shift() - return the page shift for the range - * @range: range being queried - * Return: page shift (page size = 1 << page shift) for the range - */ -static inline unsigned hmm_range_page_shift(const struct hmm_range *range) -{ - return range->page_shift; -} - -/* - * hmm_range_page_size() - return the page size for the range - * @range: range being queried - * Return: page size for the range in bytes - */ -static inline unsigned long hmm_range_page_size(const struct hmm_range *range) -{ - return 1UL << hmm_range_page_shift(range); -} - /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on diff --git a/mm/hmm.c b/mm/hmm.c index ab22fdb655c8e3..9e0052e037fedf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -345,13 +345,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i, page_size; + unsigned long i; hmm_vma_walk->last = addr; - page_size = hmm_range_page_size(range); - i = (addr - range->start) >> range->page_shift; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += page_size, i++) { + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -779,7 +778,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - unsigned long addr = start, i, pfn, mask, size, pfn_inc; + unsigned long addr = start, i, pfn, mask; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; @@ -790,24 +789,12 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; int ret = 0; - size = huge_page_size(h); - mask = size - 1; - if (range->page_shift != PAGE_SHIFT) { - /* Make sure we are looking at a full page. 
*/ - if (start & mask) - return -EINVAL; - if (end < (start + size)) - return -EINVAL; - pfn_inc = size >> PAGE_SHIFT; - } else { - pfn_inc = 1; - size = PAGE_SIZE; - } + mask = huge_page_size(h) - 1; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); - i = (start - range->start) >> range->page_shift; + i = (start - range->start) >> PAGE_SHIFT; orig_pfn = range->pfns[i]; range->pfns[i] = range->values[HMM_PFN_NONE]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); @@ -819,8 +806,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, goto unlock; } - pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); - for (; addr < end; addr += size, i++, pfn += pfn_inc) + pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT); + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; @@ -857,14 +844,13 @@ static void hmm_pfns_clear(struct hmm_range *range, */ int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) { - unsigned long mask = ((1UL << range->page_shift) - 1UL); struct hmm *hmm = mirror->hmm; unsigned long flags; range->valid = false; range->hmm = NULL; - if ((range->start & mask) || (range->end & mask)) + if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) return -EINVAL; if (range->start >= range->end) return -EINVAL; @@ -971,16 +957,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } - if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it -- cgit 1.2.3-korg From 05c23af4a1b34df5838ebab2da1d6f802cf5ece3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:46 +0300 Subject: mm/hmm: remove the mask variable in hmm_vma_walk_hugetlb_entry The pagewalk code already passes the value as the hmask parameter. 
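In other words (a sketch; the pagewalk core invokes ->hugetlb_entry with hmask == huge_page_mask(hstate_vma(vma))):

    /* The local mask recomputed what the caller already passed in: */
    mask = huge_page_size(h) - 1;	/* == ~hmask */
    pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT);
    /* is therefore equivalent to: */
    pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);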
Link: https://lore.kernel.org/r/20190806160554.14046-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 9e0052e037fedf..4807f4b167368f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -778,19 +778,16 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - unsigned long addr = start, i, pfn, mask; + unsigned long addr = start, i, pfn; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - struct hstate *h = hstate_vma(vma); uint64_t orig_pfn, cpu_flags; bool fault, write_fault; spinlock_t *ptl; pte_t entry; int ret = 0; - mask = huge_page_size(h) - 1; - ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); @@ -806,7 +803,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, goto unlock; } - pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT); + pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; -- cgit 1.2.3-korg From 309f9a4f5e1a233d5df2101b9394ee689d9e463f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:47 +0300 Subject: mm/hmm: don't abuse pte_index() in hmm_vma_handle_pmd pte_index is an internal arch helper in various architectures, without consistent semantics. Open code that calculation of a PMD index based on the virtual address instead. Link: https://lore.kernel.org/r/20190806160554.14046-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 4807f4b167368f..8d56a43426240c 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -486,7 +486,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, if (pmd_protnone(pmd) || fault || write_fault) return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); - pfn = pmd_pfn(pmd) + pte_index(addr); + pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { if (pmd_devmap(pmd)) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, -- cgit 1.2.3-korg From f0b3c45c8931fd7448a638557752f2743f76f51a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:48 +0300 Subject: mm/hmm: only define hmm_vma_walk_pud if needed We only need the special pud_entry walker if PUD-sized hugepages and pte mappings are supported, else the common pagewalk code will take care of the iteration. Not implementing this callback reduced the amount of code compiled for non-x86 platforms, and also fixes compile failures with other architectures when helpers like pud_pfn are not implemented. 
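The shape of the change, as a sketch: when the config options are off, the symbol degrades to a NULL callback and the core walker simply iterates down to the PMD level itself:

    #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
    static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start,
                                unsigned long end, struct mm_walk *walk)
    { /* ... handle huge PUD entries ... */ }
    #else
    #define hmm_vma_walk_pud	NULL
    #endif

    /* The unconditional assignment in the walker setup keeps working: */
    mm_walk.pud_entry = hmm_vma_walk_pud;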
Link: https://lore.kernel.org/r/20190806160554.14046-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 8d56a43426240c..fb1306409258ec 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -456,15 +456,6 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) -{ - if (!pud_present(pud)) - return 0; - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; -} - static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, @@ -705,10 +696,19 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, return 0; } -static int hmm_vma_walk_pud(pud_t *pudp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -772,6 +772,9 @@ static int hmm_vma_walk_pud(pud_t *pudp, return 0; } +#else +#define hmm_vma_walk_pud NULL +#endif static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long start, unsigned long end, -- cgit 1.2.3-korg From 9d3973d60f0abd3985229bd895f45e2b8974344c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:49 +0300 Subject: mm/hmm: cleanup the hmm_vma_handle_pmd stub Stub out the whole function when CONFIG_TRANSPARENT_HUGEPAGE is not set to make the function easier to read. Link: https://lore.kernel.org/r/20190806160554.14046-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index fb1306409258ec..900cb4837315cf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -456,13 +456,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } -static int hmm_vma_handle_pmd(struct mm_walk *walk, - unsigned long addr, - unsigned long end, - uint64_t *pfns, - pmd_t pmd) -{ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, uint64_t *pfns, pmd_t pmd) +{ struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -493,11 +490,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, } hmm_vma_walk->last = end; return 0; -#else - /* If THP is not enabled then we should never reach this code ! 
*/ - return -EINVAL; -#endif } +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* stub to allow the code below to compile */ +int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, uint64_t *pfns, pmd_t pmd); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { -- cgit 1.2.3-korg From 251bbe59b7a6362f4c417aa8af281d44f9edfa0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:50 +0300 Subject: mm/hmm: cleanup the hmm_vma_walk_hugetlb_entry stub Stub out the whole function and assign NULL to the .hugetlb_entry method if CONFIG_HUGETLB_PAGE is not set, as the method won't ever be called in that case. Link: https://lore.kernel.org/r/20190806160554.14046-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 900cb4837315cf..ef553e664061fa 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -774,11 +774,11 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, #define hmm_vma_walk_pud NULL #endif +#ifdef CONFIG_HUGETLB_PAGE static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { -#ifdef CONFIG_HUGETLB_PAGE unsigned long addr = start, i, pfn; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -817,10 +817,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); return ret; -#else /* CONFIG_HUGETLB_PAGE */ - return -EINVAL; -#endif } +#else +#define hmm_vma_walk_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, -- cgit 1.2.3-korg From c7d8b7824ff9de866a356e1892dbe9f191aa5d06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:42 -0300 Subject: hmm: use mmu_notifier_get/put for 'struct hmm' This is a significant simplification, it eliminates all the remaining 'hmm' stuff in mm_struct, eliminates krefing along the critical notifier paths, and takes away all the ugly locking and abuse of page_table_lock. mmu_notifier_get() provides the single struct hmm per struct mm which eliminates mm->hmm. It also directly guarantees that no mmu_notifier op callback is callable while concurrent free is possible, this eliminates all the krefs inside the mmu_notifier callbacks. The remaining krefs in the range code were overly cautious, drivers are already not permitted to free the mirror while a range exists. 
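The one new obligation this places on drivers is small; a sketch of a module exit under the new rule (nouveau and amdgpu below do exactly this; the helper name is illustrative):

    static void __exit my_driver_exit(void)
    {
    	my_driver_cleanup();		/* illustrative driver teardown */
    	/* flush any pending ->free_notifier() callbacks */
    	mmu_notifier_synchronize();
    }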
Link: https://lore.kernel.org/r/20190806231548.25242-6-jgg@ziepe.ca Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + drivers/gpu/drm/nouveau/nouveau_drm.c | 3 + include/linux/hmm.h | 12 +--- include/linux/mm_types.h | 6 -- kernel/fork.c | 1 - mm/hmm.c | 121 ++++++++------------------------ 6 files changed, 34 insertions(+), 111 deletions(-) (limited to 'mm/hmm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index f2e8b4238efd49..abc2330882014b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_irq.h" @@ -1464,6 +1465,7 @@ static void __exit amdgpu_exit(void) amdgpu_unregister_atpx_handler(); amdgpu_sync_fini(); amdgpu_fence_slab_fini(); + mmu_notifier_synchronize(); } module_init(amdgpu_init); diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 7c2fcaba42d6c3..a0e48a482452d7 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1292,6 +1293,8 @@ nouveau_drm_exit(void) #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER platform_driver_unregister(&nouveau_platform_driver); #endif + if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)) + mmu_notifier_synchronize(); } module_init(nouveau_drm_init); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 51e18fbb895345..3fec513b9c00f1 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -84,15 +84,12 @@ * @notifiers: count of active mmu notifiers */ struct hmm { - struct mm_struct *mm; - struct kref kref; + struct mmu_notifier mmu_notifier; spinlock_t ranges_lock; struct list_head ranges; struct list_head mirrors; - struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; - struct rcu_head rcu; long notifiers; }; @@ -409,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* Below are for HMM internal use only! Not to be used by device driver! 
*/ -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a37a89eb7a7c3..525d25d93330f2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -25,7 +25,6 @@ struct address_space; struct mem_cgroup; -struct hmm; /* * Each physical page in the system has a struct page associated with @@ -502,11 +501,6 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; - -#ifdef CONFIG_HMM_MIRROR - /* HMM needs to track a few things per mm */ - struct hmm *hmm; -#endif } __randomize_layout; /* diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b414802..bd4a0762f12f3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,7 +1007,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); - hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; diff --git a/mm/hmm.c b/mm/hmm.c index ef553e664061fa..49eace16f9f8da 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -26,101 +26,37 @@ #include #include -static const struct mmu_notifier_ops hmm_mmu_notifier_ops; - -/** - * hmm_get_or_create - register HMM against an mm (HMM internal) - * - * @mm: mm struct to attach to - * Return: an HMM object, either by referencing the existing - * (per-process) object, or by creating a new one. - * - * This is not intended to be used directly by device drivers. If mm already - * has an HMM struct then it get a reference on it and returns it. Otherwise - * it allocates an HMM struct, initializes it, associate it with the mm and - * returns it. - */ -static struct hmm *hmm_get_or_create(struct mm_struct *mm) +static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) { struct hmm *hmm; - lockdep_assert_held_write(&mm->mmap_sem); - - /* Abuse the page_table_lock to also protect mm->hmm. */ - spin_lock(&mm->page_table_lock); - hmm = mm->hmm; - if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) - goto out_unlock; - spin_unlock(&mm->page_table_lock); - - hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) - return NULL; + return ERR_PTR(-ENOMEM); + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); - hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->ranges_lock); - kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->mm = mm; - - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { - kfree(hmm); - return NULL; - } - - mmgrab(hmm->mm); - - /* - * We hold the exclusive mmap_sem here so we know that mm->hmm is - * still NULL or 0 kref, and is safe to update. 
- */ - spin_lock(&mm->page_table_lock); - mm->hmm = hmm; - -out_unlock: - spin_unlock(&mm->page_table_lock); - return hmm; + return &hmm->mmu_notifier; } -static void hmm_free_rcu(struct rcu_head *rcu) +static void hmm_free_notifier(struct mmu_notifier *mn) { - struct hmm *hmm = container_of(rcu, struct hmm, rcu); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - mmdrop(hmm->mm); + WARN_ON(!list_empty(&hmm->ranges)); + WARN_ON(!list_empty(&hmm->mirrors)); kfree(hmm); } -static void hmm_free(struct kref *kref) -{ - struct hmm *hmm = container_of(kref, struct hmm, kref); - - spin_lock(&hmm->mm->page_table_lock); - if (hmm->mm->hmm == hmm) - hmm->mm->hmm = NULL; - spin_unlock(&hmm->mm->page_table_lock); - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); - mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); -} - -static inline void hmm_put(struct hmm *hmm) -{ - kref_put(&hmm->kref, hmm_free); -} - static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - /* Bail out if hmm is in the process of being freed */ - if (!kref_get_unless_zero(&hmm->kref)) - return; - /* * Since hmm_range_register() holds the mmget() lock hmm_release() is * prevented as long as a range exists. @@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) mirror->ops->release(mirror); } up_read(&hmm->mirrors_sem); - - hmm_put(hmm); } static void notifiers_decrement(struct hmm *hmm) @@ -169,9 +103,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, unsigned long flags; int ret = 0; - if (!kref_get_unless_zero(&hmm->kref)) - return 0; - spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { @@ -206,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, out: if (ret) notifiers_decrement(hmm); - hmm_put(hmm); return ret; } @@ -215,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - if (!kref_get_unless_zero(&hmm->kref)) - return; - notifiers_decrement(hmm); - hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, + .alloc_notifier = hmm_alloc_notifier, + .free_notifier = hmm_free_notifier, }; /* @@ -237,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. + * + * The caller cannot unregister the hmm_mirror while any ranges are + * registered. + * + * Callers using this function must put a call to mmu_notifier_synchronize() + * in their module exit functions. 
*/ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + struct mmu_notifier *mn; + lockdep_assert_held_write(&mm->mmap_sem); /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; - mirror->hmm = hmm_get_or_create(mm); - if (!mirror->hmm) - return -ENOMEM; + mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); + if (IS_ERR(mn)) + return PTR_ERR(mn); + mirror->hmm = container_of(mn, struct hmm, mmu_notifier); down_write(&mirror->hmm->mirrors_sem); list_add(&mirror->list, &mirror->hmm->mirrors); @@ -272,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) down_write(&hmm->mirrors_sem); list_del(&mirror->list); up_write(&hmm->mirrors_sem); - hmm_put(hmm); + mmu_notifier_put(&hmm->mmu_notifier); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -854,14 +791,13 @@ int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) return -EINVAL; /* Prevent hmm_release() from running while the range is valid */ - if (!mmget_not_zero(hmm->mm)) + if (!mmget_not_zero(hmm->mmu_notifier.mm)) return -EFAULT; /* Initialize range to track CPU page table updates. */ spin_lock_irqsave(&hmm->ranges_lock, flags); range->hmm = hmm; - kref_get(&hmm->kref); list_add(&range->list, &hmm->ranges); /* @@ -893,8 +829,7 @@ void hmm_range_unregister(struct hmm_range *range) spin_unlock_irqrestore(&hmm->ranges_lock, flags); /* Drop reference taken by hmm_range_register() */ - mmput(hmm->mm); - hmm_put(hmm); + mmput(hmm->mmu_notifier.mm); /* * The range is now invalid and the ref on the hmm is dropped, so @@ -944,14 +879,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) struct mm_walk mm_walk; int ret; - lockdep_assert_held(&hmm->mm->mmap_sem); + lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) return -EBUSY; - vma = find_vma(hmm->mm, start); + vma = find_vma(hmm->mmu_notifier.mm, start); if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; -- cgit 1.2.3-korg From e3fe8e555dd05cf74168d18555c44320ed50a0e1 Mon Sep 17 00:00:00 2001 From: "Yang, Philip" Date: Thu, 15 Aug 2019 20:52:56 +0000 Subject: mm/hmm: fix hmm_range_fault()'s handling of swapped out pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hmm_range_fault() may return NULL pages because some of the pfns are equal to HMM_PFN_NONE. This happens randomly under memory pressure. The reason is that, in the swapped-out page pte path, hmm_vma_handle_pte() doesn't update the fault variable from cpu_flags, so it fails to call hmm_vma_do_fault() to swap the page in. The fix is to call hmm_pte_need_fault() to update the fault variable.
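Put differently, the swap-entry branch has to recompute the fault decision from the current pte before bailing out. A commented sketch of the fixed path (variables and helpers as in the mm/hmm.c hunk below; this paraphrases the fix rather than adding anything new):

	swp_entry_t entry = pte_to_swp_entry(pte);

	if (!non_swap_entry(entry)) {
		/*
		 * The page is swapped out. Refresh fault/write_fault from
		 * the current pte; without this they keep their initial
		 * false values, the fault is never requested, and the
		 * caller sees HMM_PFN_NONE instead of a swapped-in page.
		 */
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}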
Fixes: 74eee180b935 ("mm/hmm/mirror: device page fault handler") Link: https://lore.kernel.org/r/20190815205227.7949-1-Philip.Yang@amd.com Signed-off-by: Philip Yang Reviewed-by: "Jérôme Glisse" Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 49eace16f9f8da..fc05c8fe78b40b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -469,6 +469,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; -- cgit 1.2.3-korg From 6c64f2bbe79cf3b770ac60ae79442322bd76d55e Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 23 Aug 2019 15:17:52 -0700 Subject: mm/hmm: hmm_range_fault() NULL pointer bug Although hmm_range_fault() calls find_vma() to make sure that a vma exists before calling walk_page_range(), hmm_vma_walk_hole() can still be called with walk->vma == NULL if the start and end address are not contained within the vma range. hmm_range_fault() /* calls find_vma() but no range check */ walk_page_range() /* calls find_vma(), sets walk->vma = NULL */ __walk_page_range() walk_pgd_range() walk_p4d_range() walk_pud_range() hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(vma=0) Link: https://lore.kernel.org/r/20190823221753.2514-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index fc05c8fe78b40b..29371485fe944d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -229,6 +229,9 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; + if (!vma) + goto err; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) flags |= FAULT_FLAG_ALLOW_RETRY; if (write_fault) @@ -239,12 +242,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ return -EAGAIN; } - if (ret & VM_FAULT_ERROR) { - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; - } + if (ret & VM_FAULT_ERROR) + goto err; return -EBUSY; + +err: + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } static int hmm_pfns_bad(unsigned long addr, -- cgit 1.2.3-korg From c18ce674d548c00faa6b7e760bacbaf1f39315f3 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 23 Aug 2019 15:17:53 -0700 Subject: mm/hmm: hmm_range_fault() infinite loop Normally, callers to handle_mm_fault() are supposed to check the vma->vm_flags first. hmm_range_fault() checks for VM_READ but doesn't check for VM_WRITE if the caller requests a page to be faulted in with write permission (via the hmm_range.pfns[] value). If the vma is write protected, this can result in an infinite loop: hmm_range_fault() walk_page_range() ... hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(FAULT_FLAG_WRITE) /* returns VM_FAULT_WRITE */ /* returns -EBUSY */ /* returns -EBUSY */ /* returns -EBUSY */ /* loops on -EBUSY and range->valid */ Prevent this by checking for vma->vm_flags & VM_WRITE before calling handle_mm_fault(). 
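The contract being violated is that handle_mm_fault() does not itself validate vma access rights; the arch page-fault handlers do an access_error()-style check before calling it, and any other caller has to do the same. The added check (a paraphrase of the hunk below) restores that:

	/* A write fault on a vma that can never permit writes: fail hard. */
	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

Returning -EPERM rather than -EBUSY matters here: -EBUSY is exactly what hmm_range_fault()'s internal retry loop treats as "try again while the range is valid", which is how the loop above never terminated.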
Link: https://lore.kernel.org/r/20190823221753.2514-3-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/hmm.c') diff --git a/mm/hmm.c b/mm/hmm.c index 29371485fe944d..4882b83aeccb3e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; + if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) + return -EPERM; + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { -- cgit 1.2.3-korg From a520110e4a15ceb385304d9cab22bb51438f6080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:19:53 +0200 Subject: mm: split out a new pagewalk.h header from mm.h Add a new header for the handful of users of the walk_page_range / walk_page_vma interface instead of polluting all users of mm.h with it. Link: https://lore.kernel.org/r/20190828141955.22210-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Hellstrom Reviewed-by: Steven Price Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- arch/openrisc/kernel/dma.c | 1 + arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/s390/mm/gmap.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 46 ---------------------------- include/linux/pagewalk.h | 54 +++++++++++++++++++++++++++++++++ mm/hmm.c | 2 +- mm/madvise.c | 1 + mm/memcontrol.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 1 + mm/mincore.c | 2 +- mm/mprotect.c | 2 +- mm/pagewalk.c | 2 +- 14 files changed, 66 insertions(+), 55 deletions(-) create mode 100644 include/linux/pagewalk.h (limited to 'mm/hmm.c') diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index b41a79fcdbd93d..c7812e6effa2f5 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -16,6 +16,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 9ba07e55c4896b..236f0a861ecc37 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 39c3a6e3d26218..cf80feae970df4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 731642e0f5a0fb..8857da830b860c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 0334ca97c584d8..7cf955feb8235d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); -/** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs.
- * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." - * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) - */ -struct mm_walk { - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_entry)(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, - unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*test_walk)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - struct mm_struct *mm; - struct vm_area_struct *vma; - void *private; -}; - struct mmu_notifier_range; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h new file mode 100644 index 00000000000000..df278a94086dba --- /dev/null +++ b/include/linux/pagewalk.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGEWALK_H +#define _LINUX_PAGEWALK_H + +#include + +/** + * mm_walk - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 + * value means "do page table walk over the current vma," + * and a negative one means "abort current page table walk + * right now." 1 means "skip the current vma." 
+ * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_entry)(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_hole)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + struct mm_struct *mm; + struct vm_area_struct *vma; + void *private; +}; + +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); + +#endif /* _LINUX_PAGEWALK_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 4882b83aeccb3e..26916ff6c8df2d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -8,7 +8,7 @@ * Refer to include/linux/hmm.h for information about heterogeneous memory * management or HMM for short. */ -#include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index 968df3aa069fda..80a78bb16782d5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f5c0c517c497d..4c3af5d71ab113 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65e0874fce1736..3a96def1e796dd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -68,7 +68,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/mm/migrate.c b/mm/migrate.c index 962cb62c621fb4..c9c73a35aca76d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mincore.c b/mm/mincore.c index 4fe91d4974362f..3b051b6ab3fec6 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/mm/mprotect.c b/mm/mprotect.c index bf38dfbbb4b474..cc73318dbc2592 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -9,7 +9,7 @@ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ -#include +#include #include #include #include diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3084ff2569d2f..8a92a961a2ee4a 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -- cgit 1.2.3-korg From 7b86ac3371b70c3fd8fd95501719beb1faab719f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:19:54 +0200 Subject: pagewalk: separate function pointers from iterator data The mm_walk structure currently mixes data and code. Split out the operations vectors into a new mm_walk_ops structure, and while we are changing the API also declare the mm_walk structure inside the walk_page_range and walk_page_vma functions. Based on a patch from Linus Torvalds.
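To make the API change concrete, here is a minimal, hypothetical user of the new interface (count_* names are invented): the callback reads its state through walk->private, the ops table is const and can be shared, and no iterator state outlives the call:

#include <linux/pagewalk.h>

/* Hypothetical: count populated ptes in a range. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte_entry,
};

static unsigned long count_present(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	down_read(&mm->mmap_sem);
	/* The callback never fails, so the return value is ignored. */
	walk_page_range(mm, start, end, &count_walk_ops, &count);
	up_read(&mm->mmap_sem);
	return count;
}

This is exactly the shape the conversions below take: one const ops table per walker (smaps_walk_ops, clear_refs_walk_ops, and friends) instead of an on-stack struct rebuilt on every call.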
Link: https://lore.kernel.org/r/20190828141955.22210-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Hellstrom Reviewed-by: Steven Price Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- arch/openrisc/kernel/dma.c | 22 +++--- arch/powerpc/mm/book3s64/subpage_prot.c | 10 +-- arch/s390/mm/gmap.c | 33 ++++----- fs/proc/task_mmu.c | 78 ++++++++++---------- include/linux/pagewalk.h | 64 ++++++++++------- mm/hmm.c | 23 +++--- mm/madvise.c | 41 ++++------- mm/memcontrol.c | 23 +++--- mm/mempolicy.c | 15 ++-- mm/migrate.c | 23 +++--- mm/mincore.c | 15 ++-- mm/mprotect.c | 24 +++---- mm/pagewalk.c | 124 ++++++++++++++++++-------------- 13 files changed, 251 insertions(+), 244 deletions(-) (limited to 'mm/hmm.c') diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index c7812e6effa2f5..4d5b8bd1d79568 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -44,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops set_nocache_walk_ops = { + .pte_entry = page_set_nocache, +}; + static int page_clear_nocache(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) @@ -59,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops clear_nocache_walk_ops = { + .pte_entry = page_clear_nocache, +}; + /* * Alloc "coherent" memory, which for OpenRISC means simply uncached. * @@ -81,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, { unsigned long va; void *page; - struct mm_walk walk = { - .pte_entry = page_set_nocache, - .mm = &init_mm - }; page = alloc_pages_exact(size, gfp | __GFP_ZERO); if (!page) @@ -99,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * We need to iterate through the pages, clearing the dcache for * them and setting the cache-inhibit bit. 
*/ - if (walk_page_range(va, va + size, &walk)) { + if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, + NULL)) { free_pages_exact(page, size); return NULL; } @@ -112,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { unsigned long va = (unsigned long)vaddr; - struct mm_walk walk = { - .pte_entry = page_clear_nocache, - .mm = &init_mm - }; /* walk_page_range shouldn't be able to fail here */ - WARN_ON(walk_page_range(va, va + size, &walk)); + WARN_ON(walk_page_range(&init_mm, va, va + size, + &clear_nocache_walk_ops, NULL)); free_pages_exact(vaddr, size); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 236f0a861ecc37..2ef24a53f4c916 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; /* * We don't try too hard, we just mark all the vma in that range @@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); + walk_page_vma(vma, &subpage_walk_ops, NULL); vma = vma->vm_next; } } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cf80feae970df4..bd78d504fdade8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, return 0; } -static inline void zap_zero_pages(struct mm_struct *mm) -{ - struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; - - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); -} +static const struct mm_walk_ops zap_zero_walk_ops = { + .pmd_entry = __zap_zero_pages, +}; /* * switch on pgstes for its userspace process (for kvm) @@ -2546,7 +2542,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - zap_zero_pages(mm); + walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); up_write(&mm->mmap_sem); return 0; } @@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, +}; + int s390_enable_skey(void) { - struct mm_walk walk = { - .hugetlb_entry = __s390_enable_skey_hugetlb, - .pte_entry = __s390_enable_skey_pte, - }; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; @@ -2614,8 +2611,7 @@ int s390_enable_skey(void) } mm->def_flags &= ~VM_MERGEABLE; - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: up_write(&mm->mmap_sem); @@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, +}; + void s390_reset_cmma(struct mm_struct *mm) { - struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; - down_write(&mm->mmap_sem); - 
walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); up_write(&mm->mmap_sem); } EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8857da830b860c..bf43d1d6005926 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, return 0; } -#endif +#else +#define smaps_pte_hole NULL +#endif /* CONFIG_SHMEM */ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) @@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } return 0; } +#else +#define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +static const struct mm_walk_ops smaps_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, +}; + +static const struct mm_walk_ops smaps_shmem_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, +}; + static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss) { - struct mm_walk smaps_walk = { - .pmd_entry = smaps_pte_range, -#ifdef CONFIG_HUGETLB_PAGE - .hugetlb_entry = smaps_hugetlb_range, -#endif - .mm = vma->vm_mm, - }; - - smaps_walk.private = mss; - #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma, mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + walk_page_vma(vma, &smaps_shmem_walk_ops, mss); + return; } } #endif /* mmap_sem is held in m_start */ - walk_page_vma(vma, &smaps_walk); + walk_page_vma(vma, &smaps_walk_ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, return 0; } +static const struct mm_walk_ops clear_refs_walk_ops = { + .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, +}; + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct clear_refs_private cp = { .type = type, }; - struct mm_walk clear_refs_walk = { - .pmd_entry = clear_refs_pte_range, - .test_walk = clear_refs_test_walk, - .mm = mm, - .private = &cp, - }; if (type == CLEAR_REFS_MM_HIWATER_RSS) { if (down_write_killable(&mm->mmap_sem)) { @@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); + walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, + &cp); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); @@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, return err; } +#else +#define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +static const struct mm_walk_ops pagemap_ops = { + .pmd_entry = pagemap_pmd_range, + .pte_hole = pagemap_pte_hole, + .hugetlb_entry = pagemap_hugetlb_range, +}; + /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; struct pagemapread pm; - struct 
mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; @@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!pm.buffer) goto out_mm; - pagemap_walk.pmd_entry = pagemap_pmd_range; - pagemap_walk.pte_hole = pagemap_pte_hole; -#ifdef CONFIG_HUGETLB_PAGE - pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; -#endif - pagemap_walk.mm = mm; - pagemap_walk.private = &pm; - src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; @@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, ret = down_read_killable(&mm->mmap_sem); if (ret) goto out_free; - ret = walk_page_range(start_vaddr, end, &pagemap_walk); + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); up_read(&mm->mmap_sem); start_vaddr = end; @@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, } #endif +static const struct mm_walk_ops show_numa_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, +}; + /* * Display pages allocated per node and memory policy via /proc. */ @@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mm_walk walk = { - .hugetlb_entry = gather_hugetlb_stats, - .pmd_entry = gather_pte_stats, - .private = md, - .mm = mm, - }; struct mempolicy *pol; char buffer[64]; int nid; @@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v) seq_puts(m, " huge"); /* mmap_sem is held by m_start */ - walk_page_vma(vma, &walk); + walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index df278a94086dba..bddd9759bab936 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -4,31 +4,28 @@ #include +struct mm_walk; + /** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." - * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) + * mm_walk_ops - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs.
+ * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 means + * "do page table walk over the current vma", returning + * a negative value means "abort current page table walk + * right now" and returning 1 means "skip the current vma" */ -struct mm_walk { +struct mm_walk_ops { int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -42,13 +39,28 @@ struct mm_walk { struct mm_walk *walk); int (*test_walk)(unsigned long addr, unsigned long next, struct mm_walk *walk); +}; + +/** + * mm_walk - walk_page_range data + * @ops: operation to call during the walk + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + const struct mm_walk_ops *ops; struct mm_struct *mm; struct vm_area_struct *vma; void *private; }; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private); #endif /* _LINUX_PAGEWALK_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 26916ff6c8df2d..902f5fa6bf93ad 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -852,6 +852,13 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); +static const struct mm_walk_ops hmm_walk_ops = { + .pud_entry = hmm_vma_walk_pud, + .pmd_entry = hmm_vma_walk_pmd, + .pte_hole = hmm_vma_walk_hole, + .hugetlb_entry = hmm_vma_walk_hugetlb_entry, +}; + /** * hmm_range_fault - try to fault some address in a virtual address range * @range: range being faulted @@ -887,7 +894,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; struct vm_area_struct *vma; - struct mm_walk mm_walk; int ret; lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); @@ -916,21 +922,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) hmm_vma_walk.last = start; hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; end = min(range->end, vma->vm_end); - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pud_entry = hmm_vma_walk_pud; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, + &hmm_vma_walk); do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(vma->vm_mm, start, end, + &hmm_walk_ops, &hmm_vma_walk); start = hmm_vma_walk.last; /* Keep trying while the range is valid. 
*/ diff --git a/mm/madvise.c b/mm/madvise.c index 80a78bb16782d5..afe2b015ea58a3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -226,19 +226,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } -static void force_swapin_readahead(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_walk walk = { - .mm = vma->vm_mm, - .pmd_entry = swapin_walk_pmd_entry, - .private = vma, - }; - - walk_page_range(start, end, &walk); - - lru_add_drain(); /* Push any new pages onto the LRU now */ -} +static const struct mm_walk_ops swapin_walk_ops = { + .pmd_entry = swapin_walk_pmd_entry, +}; static void force_shm_swapin_readahead(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -280,7 +270,8 @@ static long madvise_willneed(struct vm_area_struct *vma, *prev = vma; #ifdef CONFIG_SWAP if (!file) { - force_swapin_readahead(vma, start, end); + walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); + lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } @@ -441,20 +432,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static void madvise_free_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - struct mm_walk free_walk = { - .pmd_entry = madvise_free_pte_range, - .mm = vma->vm_mm, - .private = tlb, - }; - - tlb_start_vma(tlb, vma); - walk_page_range(addr, end, &free_walk); - tlb_end_vma(tlb, vma); -} +static const struct mm_walk_ops madvise_free_walk_ops = { + .pmd_entry = madvise_free_pte_range, +}; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) @@ -481,7 +461,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - madvise_free_page_range(&tlb, vma, range.start, range.end); + tlb_start_vma(&tlb, vma); + walk_page_range(vma->vm_mm, range.start, range.end, + &madvise_free_walk_ops, &tlb); + tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, range.start, range.end); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c3af5d71ab113..9b2516a76be234 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5283,17 +5283,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } +static const struct mm_walk_ops precharge_walk_ops = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, +}; + static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) { unsigned long precharge; - struct mm_walk mem_cgroup_count_precharge_walk = { - .pmd_entry = mem_cgroup_count_precharge_pte_range, - .mm = mm, - }; down_read(&mm->mmap_sem); - walk_page_range(0, mm->highest_vm_end, - &mem_cgroup_count_precharge_walk); + walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -5562,13 +5561,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return ret; } +static const struct mm_walk_ops charge_walk_ops = { + .pmd_entry = mem_cgroup_move_charge_pte_range, +}; + static void mem_cgroup_move_charge(void) { - struct mm_walk mem_cgroup_move_charge_walk = { - .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mc.mm, - }; - lru_add_drain_all(); /* * Signal lock_page_memcg() to take the memcg's move_lock @@ -5594,7 +5592,8 @@ static void mem_cgroup_move_charge(void) * When we have consumed all precharges and failed in doing * additional charge, the page walk 
just aborts. */ - walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, + NULL); up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3a96def1e796dd..f000771558d882 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, return 1; } +static const struct mm_walk_ops queue_pages_walk_ops = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, +}; + /* * Walk through page tables and collect pages to be migrated. * @@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .nmask = nodes, .prev = NULL, }; - struct mm_walk queue_pages_walk = { - .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, - .test_walk = queue_pages_test_walk, - .mm = mm, - .private = &qp, - }; - return walk_page_range(start, end, &queue_pages_walk); + return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); } /* diff --git a/mm/migrate.c b/mm/migrate.c index c9c73a35aca76d..9f4ed4e985c1fe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2320,6 +2320,11 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, return 0; } +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + /* * migrate_vma_collect() - collect pages over a range of virtual addresses * @migrate: migrate struct containing all migration information @@ -2331,21 +2336,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, static void migrate_vma_collect(struct migrate_vma *migrate) { struct mmu_notifier_range range; - struct mm_walk mm_walk = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, - .vma = migrate->vma, - .mm = migrate->vma->vm_mm, - .private = migrate, - }; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, + migrate->vma->vm_mm, migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); - walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(&range); + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } diff --git a/mm/mincore.c b/mm/mincore.c index 3b051b6ab3fec6..f9a9dbe8cd3301 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; } +static const struct mm_walk_ops mincore_walk_ops = { + .pmd_entry = mincore_pte_range, + .pte_hole = mincore_unmapped_range, + .hugetlb_entry = mincore_hugetlb, +}; + /* * Do a chunk of "sys_mincore()". 
We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v struct vm_area_struct *vma; unsigned long end; int err; - struct mm_walk mincore_walk = { - .pmd_entry = mincore_pte_range, - .pte_hole = mincore_unmapped_range, - .hugetlb_entry = mincore_hugetlb, - .private = vec, - }; vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) @@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v memset(vec, 1, pages); return pages; } - mincore_walk.mm = vma->vm_mm; - err = walk_page_range(addr, end, &mincore_walk); + err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); if (err < 0) return err; return (end - addr) >> PAGE_SHIFT; diff --git a/mm/mprotect.c b/mm/mprotect.c index cc73318dbc2592..675e5d34a50773 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next, return 0; } -static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long newflags) -{ - pgprot_t new_pgprot = vm_get_page_prot(newflags); - struct mm_walk prot_none_walk = { - .pte_entry = prot_none_pte_entry, - .hugetlb_entry = prot_none_hugetlb_entry, - .test_walk = prot_none_test, - .mm = current->mm, - .private = &new_pgprot, - }; - - return walk_page_range(start, end, &prot_none_walk); -} +static const struct mm_walk_ops prot_none_walk_ops = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, +}; int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, @@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (arch_has_pfn_modify_check() && (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { - error = prot_none_walk(vma, start, end, newflags); + pgprot_t new_pgprot = vm_get_page_prot(newflags); + + error = walk_page_range(current->mm, start, end, + &prot_none_walk_ops, &new_pgprot); if (error) return error; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8a92a961a2ee4a..b8762b673a3d4c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { pte_t *pte; int err = 0; + const struct mm_walk_ops *ops = walk->ops; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, { pmd_t *pmd; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pmd = pmd_offset(pud, addr); @@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd) || !walk->vma) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; @@ -47,8 +49,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ - if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, walk); + if (ops->pmd_entry) + err = ops->pmd_entry(pmd, addr, next, walk); if (err) 
break; @@ -56,7 +58,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, * Check this here so we only break down trans_huge * pages when we _need_ to */ - if (!walk->pte_entry) + if (!ops->pte_entry) continue; split_huge_pmd(walk->vma, pmd, addr); @@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, { pud_t *pud; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pud = pud_offset(p4d, addr); @@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, again: next = pud_addr_end(addr, end); if (pud_none(*pud) || !walk->vma) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pud_entry) { + if (ops->pud_entry) { spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); if (ptl) { - err = walk->pud_entry(pud, addr, next, walk); + err = ops->pud_entry(pud, addr, next, walk); spin_unlock(ptl); if (err) break; @@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pud)) goto again; - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) break; @@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, { p4d_t *p4d; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, { pgd_t *pgd; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; @@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; + const struct mm_walk_ops *ops = walk->ops; int err = 0; do { @@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte) - err = walk->hugetlb_entry(pte, hmask, addr, next, walk); - else if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = ops->hugetlb_entry(pte, hmask, addr, next, walk); + else if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; @@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; - if (walk->test_walk) - return walk->test_walk(start, end, walk); + if (ops->test_walk) + return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages 
behind VM_PFNMAP @@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end, */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; - if (walk->pte_hole) - err = walk->pte_hole(start, end, walk); + if (ops->pte_hole) + err = ops->pte_hole(start, end, walk); return err ? err : 1; } return 0; @@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, struct vm_area_struct *vma = walk->vma; if (vma && is_vm_hugetlb_page(vma)) { - if (walk->hugetlb_entry) + if (walk->ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); @@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end, /** * walk_page_range - walk page table with caller specific callbacks - * @start: start address of the virtual address range - * @end: end address of the virtual address range - * @walk: mm_walk structure defining the callbacks and the target address space + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @private: private data for callbacks' usage * - * Recursively walk the page table tree of the process represented by @walk->mm + * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these @@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end, * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking - * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some - * caller-specific data to callbacks, @walk->private should be helpful. + * caller-specific data to callbacks, @private should be helpful. * * Locking: - * Callers of walk_page_range() and walk_page_vma() should hold - * @walk->mm->mmap_sem, because these function traverse vma list and/or - * access to vma's data. + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem, + * because these function traverse vma list and/or access to vma's data. 
*/ -int walk_page_range(unsigned long start, unsigned long end, - struct mm_walk *walk) +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .private = private, + }; if (start >= end) return -EINVAL; - if (!walk->mm) + if (!walk.mm) return -EINVAL; - VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); + VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm); - vma = find_vma(walk->mm, start); + vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ - walk->vma = NULL; + walk.vma = NULL; next = end; } else if (start < vma->vm_start) { /* outside vma */ - walk->vma = NULL; + walk.vma = NULL; next = min(end, vma->vm_start); } else { /* inside vma */ - walk->vma = vma; + walk.vma = vma; next = min(end, vma->vm_end); vma = vma->vm_next; - err = walk_page_test(start, next, walk); + err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for @@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end, if (err < 0) break; } - if (walk->vma || walk->pte_hole) - err = __walk_page_range(start, next, walk); + if (walk.vma || walk.ops->pte_hole) + err = __walk_page_range(start, next, &walk); if (err) break; } while (start = next, start < end); return err; } -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private) { + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; int err; - if (!walk->mm) + if (!walk.mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - VM_BUG_ON(!vma); - walk->vma = vma; - err = walk_page_test(vma->vm_start, vma->vm_end, walk); + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) return 0; if (err < 0) return err; - return __walk_page_range(vma->vm_start, vma->vm_end, walk); + return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } -- cgit 1.2.3-korg