From 66f0dc481e5b802ab363b979fc1753410c7d82b5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 30 Dec 2009 20:17:34 +0000 Subject: mm: move sys_mmap_pgoff from util.c Move sys_mmap_pgoff() from mm/util.c to mm/mmap.c and mm/nommu.c, where we'd expect to find such code: especially now that it contains the MAP_HUGETLB handling. Revert mm/util.c to how it was in 2.6.32. This patch just ignores MAP_HUGETLB in the nommu case, as in 2.6.32, whereas 2.6.33-rc2 reported -ENOSYS. Perhaps validate_mmap_request() should reject it with -EINVAL? Add that later if necessary. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/nommu.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm/nommu.c') diff --git a/mm/nommu.c b/mm/nommu.c index 8687973462bbea..6f9248f89bdefb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1398,6 +1398,31 @@ unsigned long do_mmap_pgoff(struct file *file, } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. -- cgit 1.2.3-korg From cfe79c00a2f4f687eed8b7534d1d3d3d35540c29 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 6 Jan 2010 17:23:23 +0000 Subject: NOMMU: Avoiding duplicate icache flushes of shared maps When working with FDPIC, there are many shared mappings of read-only code regions between applications (the C library, applet packages like busybox, etc.), but the current do_mmap_pgoff() function will issue an icache flush whenever a VMA is added to an MM instead of only doing it when the map is initially created. The flush can instead be done when a region is first mmapped PROT_EXEC. Note that we may not rely on the first mapping of a region being executable - it's possible for it to be PROT_READ only, so we have to remember whether we've flushed the region or not, and then flush the entire region when a bit of it is made executable. However, this also affects the brk area. That will no longer be executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but for NOMMU mode kernels, when it increases the brk allocation, making sys_brk() flush the extra from the icache should suffice. The brk area probably isn't used by NOMMU programs since the brk area can only use up the leavings from the stack allocation, where the stack allocation is larger than requested. Signed-off-by: David Howells Signed-off-by: Mike Frysinger Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ mm/nommu.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'mm/nommu.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 84a524afb3dcdf..84d020bed0830e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -123,6 +123,8 @@ struct vm_region { struct file *vm_file; /* the backing file or NULL */ atomic_t vm_usage; /* region usage count */ + bool vm_icache_flushed : 1; /* true if the icache has been flushed for + * this region */ }; /* diff --git a/mm/nommu.c b/mm/nommu.c index 6f9248f89bdefb..a8d17521624ab5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; -- cgit 1.2.3-korg From 7959722b951cffcd61a0a35229d007deeed8c2dd Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Wed, 6 Jan 2010 17:23:28 +0000 Subject: NOMMU: Use copy_*_user_page() in access_process_vm() The MMU code uses the copy_*_user_page() variants in access_process_vm() rather than copy_*_user() as the former includes an icache flush. This is important when doing things like setting software breakpoints with gdb. So switch the NOMMU code over to do the same. This patch makes the reasonable assumption that copy_from_user_page() won't fail - which is probably fine, as we've checked the VMA from which we're copying is usable, and the copy is not allowed to cross VMAs. The one case where it might go wrong is if the VMA is a device rather than RAM, and that device returns an error which - in which case rubbish will be returned rather than EIO. Signed-off-by: Jie Zhang Signed-off-by: Mike Frysinger Signed-off-by: David Howells Acked-by: David McCullough Acked-by: Paul Mundt Acked-by: Greg Ungerer Signed-off-by: Linus Torvalds --- mm/nommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/nommu.c') diff --git a/mm/nommu.c b/mm/nommu.c index a8d17521624ab5..17773862619b53 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1921,9 +1921,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { -- cgit 1.2.3-korg From 1e2ae599d37e60958c03ca5e46b1f657619a30cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:33 -0800 Subject: nommu: struct vm_region's vm_usage count need not be atomic The vm_usage count field in struct vm_region does not need to be atomic as it's only even modified whilst nommu_region_sem is write locked. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- mm/nommu.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/nommu.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 84d020bed0830e..80cfa78a8cf694 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -122,7 +122,7 @@ struct vm_region { unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ - atomic_t vm_usage; /* region usage count */ + int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; diff --git a/mm/nommu.c b/mm/nommu.c index 17773862619b53..5e39294f8ea8d2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -552,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(®ion->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -1205,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(®ion->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1272,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1289,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL; vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1444,7 +1444,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, /* we're only permitted to split anonymous regions that have a single * owner */ if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + vma->vm_region->vm_usage != 1) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1518,7 +1518,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(®ion->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); -- cgit 1.2.3-korg From 779c10232ceb11c1b259232c4845cfb2850287b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:34 -0800 Subject: nommu: remove a superfluous check of vm_region::vm_usage In split_vma(), there's no need to check if the VMA being split has a region that's in use by more than one VMA because: (1) The preceding test prohibits splitting of non-anonymous VMAs and regions (eg: file or chardev backed VMAs). (2) Anonymous regions can't be mapped multiple times because there's no handle by which to refer to the already existing region. (3) If a VMA has previously been split, then the region backing it has also been split into two regions, each of usage 1. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/nommu.c') diff --git a/mm/nommu.c b/mm/nommu.c index 5e39294f8ea8d2..d6dd656264a235 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1441,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, kenter(""); - /* we're only permitted to split anonymous regions that have a single - * owner */ - if (vma->vm_file || - vma->vm_region->vm_usage != 1) + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) -- cgit 1.2.3-korg From efc1a3b16930c41d64ffefde16b87d82f603a8a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:35 -0800 Subject: nommu: don't need get_unmapped_area() for NOMMU get_unmapped_area() is unnecessary for NOMMU as no-one calls it. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 7 +++++-- mm/nommu.c | 21 --------------------- mm/util.c | 2 +- 4 files changed, 8 insertions(+), 24 deletions(-) (limited to 'mm/nommu.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 80cfa78a8cf694..36f96271306c72 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -205,10 +205,12 @@ struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; struct vm_area_struct * mmap_cache; /* last find_vma result */ +#ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void (*unmap_area) (struct mm_struct *mm, unsigned long addr); +#endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d4991be9d5380..6f7bba93929bf5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -377,6 +377,8 @@ extern int sysctl_max_map_count; #include +#ifdef CONFIG_MMU +extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -386,6 +388,9 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags); extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); +#else +static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +#endif #if USE_SPLIT_PTLOCKS /* @@ -2491,8 +2496,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -extern void arch_pick_mmap_layout(struct mm_struct *mm); - #ifdef CONFIG_TRACING extern void __trace_special(void *__tr, void *__data, diff --git a/mm/nommu.c b/mm/nommu.c index d6dd656264a235..32be0cf51ba681 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1760,27 +1760,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/util.c b/mm/util.c index 7c35ad95f92756..834db7be240f3e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; -- cgit 1.2.3-korg From 7e6608724c640924aad1d556d17df33ebaa6124d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Jan 2010 17:01:39 -0800 Subject: nommu: fix shared mmap after truncate shrinkage problems Fix a problem in NOMMU mmap with ramfs whereby a shared mmap can happen over the end of a truncation. The problem is that ramfs_nommu_check_mappings() checks that the reduced file size against the VMA tree, but not the vm_region tree. The following sequence of events can cause the problem: fd = open("/tmp/x", O_RDWR|O_TRUNC|O_CREAT, 0600); ftruncate(fd, 32 * 1024); a = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); b = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); munmap(a, 32 * 1024); ftruncate(fd, 16 * 1024); c = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); Mapping 'a' creates a vm_region covering 32KB of the file. Mapping 'b' sees that the vm_region from 'a' is covering the region it wants and so shares it, pinning it in memory. Mapping 'a' then goes away and the file is truncated to the end of VMA 'b'. However, the region allocated by 'a' is still in effect, and has _not_ been reduced. Mapping 'c' is then created, and because there's a vm_region covering the desired region, get_unmapped_area() is _not_ called to repeat the check, and the mapping is granted, even though the pages from the latter half of the mapping have been discarded. However: d = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); Mapping 'd' should work, and should end up sharing the region allocated by 'a'. To deal with this, we shrink the vm_region struct during the truncation, lest do_mmap_pgoff() take it as licence to share the full region automatically without calling the get_unmapped_area() file op again. Signed-off-by: David Howells Acked-by: Al Viro Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 31 +------------------------- include/linux/mm.h | 1 + mm/nommu.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 30 deletions(-) (limited to 'mm/nommu.c') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 266531343aae05..1739a4aba25fc0 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -121,35 +121,6 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return ret; } -/*****************************************************************************/ -/* - * check that file shrinkage doesn't leave any VMAs dangling in midair - */ -static int ramfs_nommu_check_mappings(struct inode *inode, - size_t newsize, size_t size) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - down_write(&nommu_region_sem); - - /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - newsize >> PAGE_SHIFT, - (size + PAGE_SIZE - 1) >> PAGE_SHIFT - ) { - /* found one - only interested if it's shared out of the page - * cache */ - if (vma->vm_flags & VM_SHARED) { - up_write(&nommu_region_sem); - return -ETXTBSY; /* not quite true, but near enough */ - } - } - - up_write(&nommu_region_sem); - return 0; -} - /*****************************************************************************/ /* * @@ -169,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) /* check that a decrease in size doesn't cut off any shared mappings */ if (newsize < size) { - ret = ramfs_nommu_check_mappings(inode, newsize, size); + ret = nommu_shrink_inode_mappings(inode, size, newsize); if (ret < 0) return ret; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2265f28eb47a92..60c467bfbabd66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1089,6 +1089,7 @@ extern void zone_pcp_update(struct zone *zone); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; +extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/mm/nommu.c b/mm/nommu.c index 32be0cf51ba681..48a2ecfaf05947 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1914,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. + */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} -- cgit 1.2.3-korg