From dca496451bddea9aa87b7510dc2eb413d1a19dfd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: move common parts out of pcpu_[de]populate_chunk() percpu-vm and percpu-km implement separate versions of pcpu_[de]populate_chunk() and some part which is or should be common are currently in the specific implementations. Make the following changes. * Allocate area clearing is moved from the pcpu_populate_chunk() implementations to pcpu_alloc(). This makes percpu-km's version noop. * Quick exit tests in pcpu_[de]populate_chunk() of percpu-vm are moved to their respective callers so that they are applied to percpu-km too. This doesn't make any meaningful difference as both functions are noop for percpu-km; however, this is more consistent and will help implementing atomic allocation support. Signed-off-by: Tejun Heo --- mm/percpu.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index da997f9800bdea..6087384f6ef014 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -709,7 +709,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + int slot, off, new_alloc, cpu; + int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -802,17 +803,32 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + + if (rs != page_start || re != page_end) { + WARN_ON(chunk->immutable); + + if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + + bitmap_set(chunk->populated, page_start, page_end - page_start); } mutex_unlock(&pcpu_alloc_mutex); - /* return address relative to base address */ + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; @@ -903,7 +919,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs = 0, re; + + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); + if (rs || re != PFN_UP(pcpu_unit_size)) + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + pcpu_destroy_chunk(chunk); } -- cgit 1.2.3-korg From a93ace487a339dccf7040be7fee08c3415188e14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: move region iterations out of pcpu_[de]populate_chunk() Previously, pcpu_[de]populate_chunk() were called with the range which may contain multiple target regions in it and pcpu_[de]populate_chunk() iterated over the regions. 
This has the benefit of batching up cache flushes for all the regions; however, we're planning to add more bookkeeping logic around [de]population to support atomic allocations and this delegation of iterations gets in the way. This patch moves the region iterations out of pcpu_[de]populate_chunk() into its callers - pcpu_alloc() and pcpu_reclaim() - so that we can later add logic to track more states around them. This change may make cache and tlb flushes more frequent but multi-region [de]populations are rare anyway and if this actually becomes a problem, it's not difficult to factor out cache flushes as separate callbacks which are directly invoked from percpu.c. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 6 ++++-- mm/percpu-vm.c | 57 ++++++++++++++++----------------------------------------- mm/percpu.c | 19 ++++++++----------- 3 files changed, 28 insertions(+), 54 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 9a9096f088670a..a6e34bc1aff4ac 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,12 +33,14 @@ #include -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index edf70979331848..538998a137d24e 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -261,8 +261,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,66 +270,43 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - int rs, re, rc; pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. 
*/ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - int rs, re; /* * If control reaches here, there must have been at least one @@ -342,13 +319,11 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, rs, re); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index 6087384f6ef014..fe5de97d7caadd 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -807,20 +807,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - - if (rs != page_start || re != page_end) { + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, off, size)) { + if (pcpu_populate_chunk(chunk, rs, re)) { spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, page_start, page_end - page_start); + bitmap_set(chunk->populated, rs, re - rs); } mutex_unlock(&pcpu_alloc_mutex); @@ -919,12 +916,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - int rs = 0, re; - - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); - if (rs || re != PFN_UP(pcpu_unit_size)) - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs, re; + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + bitmap_clear(chunk->populated, rs, re - rs); + } pcpu_destroy_chunk(chunk); } -- cgit 1.2.3-korg From b38d08f3181c5025a7ce84646494cc4748492a3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: restructure locking At first, the percpu allocator required a sleepable context for both alloc and free paths and used pcpu_alloc_mutex to protect everything. Later, pcpu_lock was introduced to protect the index data structure so that the free path can be invoked from atomic contexts. The conversion only updated what's necessary and left most of the allocation path under pcpu_alloc_mutex. The percpu allocator is planned to add support for atomic allocation and this patch restructures locking so that the coverage of pcpu_alloc_mutex is further reduced. * pcpu_alloc() now grab pcpu_alloc_mutex only while creating a new chunk and populating the allocated area. Everything else is now protected soley by pcpu_lock. After this change, multiple instances of pcpu_extend_area_map() may race but the function already implements sufficient synchronization using pcpu_lock. This also allows multiple allocators to arrive at new chunk creation. 
To avoid creating multiple empty chunks back-to-back, a new chunk is created iff there is no other empty chunk after grabbing pcpu_alloc_mutex. * pcpu_lock is now held while modifying chunk->populated bitmap. After this, all data structures are protected by pcpu_lock. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 ++ mm/percpu.c | 75 +++++++++++++++++++++++++++------------------------------- 2 files changed, 37 insertions(+), 40 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 67a971b7f74540..e662b4947a6572 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -68,7 +68,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + spin_lock_irq(&pcpu_lock); bitmap_fill(chunk->populated, nr_pages); + spin_unlock_irq(&pcpu_lock); return chunk; } diff --git a/mm/percpu.c b/mm/percpu.c index fe5de97d7caadd..e59f7b405bed43 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -152,31 +152,12 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * Free path accesses and alters only the index data structures and can be + * safely called from atomic context. When memory needs to be returned to + * the system, free path schedules reclaim_work. 
*/ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -709,7 +690,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc, cpu; + int slot, off, new_alloc, cpu, ret; int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -729,7 +710,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -745,7 +725,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } @@ -771,7 +751,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -787,37 +767,53 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. 
+ */ + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ + mutex_lock(&pcpu_alloc_mutex); page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, rs, re)) { - spin_lock_irqsave(&pcpu_lock, flags); + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); @@ -832,8 +828,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); +fail: if (warn_limit) { pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " "%s\n", size, align, err); -- cgit 1.2.3-korg From a16037c8dfc2734c1a2c8e3ffd4766ed25f2a41d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: make pcpu_alloc_area() capable of allocating only from populated areas Update pcpu_alloc_area() so that it can skip unpopulated areas if the new parameter @pop_only is true. This is implemented by a new function, pcpu_fit_in_area(), which determines the amount of head padding considering the alignment and populated state. @pop_only is currently always false but this will be used to implement atomic allocation. Signed-off-by: Tejun Heo --- mm/percpu.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index e59f7b405bed43..e18aa143aab15d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -399,11 +399,61 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) return 0; } +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. 
+ */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -418,7 +468,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -434,11 +485,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -730,7 +781,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; @@ -761,7 +812,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; } -- cgit 1.2.3-korg From e04d320838f573d8fa989a0d7af0972f9b0142d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: indent the population block in pcpu_alloc() The next patch will conditionalize the population block in pcpu_alloc() which will end up making a rather large indentation change obfuscating the actual logic change. This patch puts the block under "if (true)" so that the next patch can avoid indentation changes. The defintions of the local variables which are used only in the block are moved into the block. This patch is purely cosmetic. 
Signed-off-by: Tejun Heo --- mm/percpu.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index e18aa143aab15d..577d84fb30023b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -742,7 +742,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) struct pcpu_chunk *chunk; const char *err; int slot, off, new_alloc, cpu, ret; - int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -847,27 +846,32 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - mutex_lock(&pcpu_alloc_mutex); - page_start = PFN_DOWN(off); - page_end = PFN_UP(off + size); + if (true) { + int page_start, page_end, rs, re; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - WARN_ON(chunk->immutable); + mutex_lock(&pcpu_alloc_mutex); - ret = pcpu_populate_chunk(chunk, rs, re); + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); - spin_lock_irqsave(&pcpu_lock, flags); - if (ret) { - mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } - bitmap_set(chunk->populated, rs, re - rs); - spin_unlock_irqrestore(&pcpu_lock, flags); - } - mutex_unlock(&pcpu_alloc_mutex); + mutex_unlock(&pcpu_alloc_mutex); + } /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) -- cgit 1.2.3-korg From 5835d96e9ce4efdba8c6cefffc2f1575925456de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: implement [__]alloc_percpu_gfp() Now that pcpu_alloc_area() can allocate only from populated areas, it's easy to add atomic allocation support to [__]alloc_percpu(). Update pcpu_alloc() so that it accepts @gfp and skips all the blocking operations and allocates only from the populated areas if @gfp doesn't contain GFP_KERNEL. New interface functions [__]alloc_percpu_gfp() are added. While this means that atomic allocations are possible, this isn't complete yet as there's no mechanism to ensure that certain amount of populated areas is kept available and atomic allocations may keep failing under certain conditions. 
Signed-off-by: Tejun Heo --- include/linux/percpu.h | 9 +++++-- mm/percpu.c | 64 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 27 deletions(-) (limited to 'mm/percpu.c') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 6f61b61b7996a8..d1b416da25ed40 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void); #endif extern void __init percpu_init_late(void); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -#define alloc_percpu(type) \ - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) +#define alloc_percpu_gfp(type, gfp) \ + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ + __alignof__(type), gfp) +#define alloc_percpu(type) \ + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 577d84fb30023b..c52b93117dc20a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -151,11 +151,6 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; -/* - * Free path accesses and alters only the index data structures and can be - * safely called from atomic context. When memory needs to be returned to - * the system, free path schedules reclaim_work. - */ static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ @@ -727,20 +722,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; + bool is_atomic = !(gfp & GFP_KERNEL); int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -773,14 +769,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) while ((new_alloc = pcpu_need_to_extend(chunk))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; @@ -797,6 +794,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) new_alloc = pcpu_need_to_extend(chunk); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -811,7 +810,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto restart; } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; } @@ -824,6 +823,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. */ + if (is_atomic) + goto fail; + mutex_lock(&pcpu_alloc_mutex); if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { @@ -846,7 +848,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - if (true) { + if (!is_atomic) { int page_start, page_end, rs, re; mutex_lock(&pcpu_alloc_mutex); @@ -884,9 +886,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -895,22 +897,34 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -932,7 +946,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** -- cgit 1.2.3-korg From 9c824b6a172c8d44a6b037946bae90127c969b1b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: make sure chunk->map array has available space An allocation attempt may require extending chunk->map array which requires GFP_KERNEL context which isn't available for atomic allocations. This patch ensures that chunk->map array usually keeps some amount of available space by directly allocating buffer space during GFP_KERNEL allocations and scheduling async extension during atomic ones. This should make atomic allocation failures from map space exhaustion rare. Signed-off-by: Tejun Heo --- mm/percpu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 8 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index c52b93117dc20a..546ced05cf33b7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,8 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,9 +104,12 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ @@ -318,9 +323,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accomodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -329,15 +339,25 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. 
*/ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } + + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -394,6 +414,20 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -647,6 +681,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -767,7 +802,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic || pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -792,7 +827,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { if (is_atomic) continue; @@ -1418,6 +1453,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1446,6 +1482,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); -- cgit 1.2.3-korg From b539b87fed37ffc16c89a6bc3beca2d7aed82e1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implmeent pcpu_nr_empty_pop_pages and chunk->nr_populated pcpu_nr_empty_pop_pages counts the number of empty populated pages across all chunks and chunk->nr_populated counts the number of populated pages in a chunk. Both will be used to implement pre/async population for atomic allocations. pcpu_chunk_[de]populated() are added to update chunk->populated, chunk->nr_populated and pcpu_nr_empty_pop_pages together. All successful chunk [de]populations should be followed by the corresponding pcpu_chunk_[de]populated() calls. 
Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 114 insertions(+), 10 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index e662b4947a6572..10e3d0b8a86d1b 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - bitmap_fill(chunk->populated, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irq(&pcpu_lock); return chunk; diff --git a/mm/percpu.c b/mm/percpu.c index 546ced05cf33b7..4f2d58760c9c25 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -113,6 +113,7 @@ struct pcpu_chunk { void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -161,6 +162,12 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. + */ +static int pcpu_nr_empty_pop_pages; + /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); @@ -295,6 +302,38 @@ static void pcpu_mem_free(void *ptr, size_t size) vfree(ptr); } +/** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. + */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -483,6 +522,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * @size: wanted size in bytes * @align: wanted align * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -498,7 +538,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * found. 
*/ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only) + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -587,6 +627,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -602,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -610,7 +652,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -641,6 +684,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? */ if (!(p[1] & 1)) to_free++; @@ -696,6 +741,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. 
* @@ -772,6 +861,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = !(gfp & GFP_KERNEL); + int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -812,7 +902,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -845,7 +936,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, goto restart; } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } @@ -899,17 +991,20 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, spin_lock_irqsave(&pcpu_lock, flags); if (ret) { mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1019,7 +1114,9 @@ static void pcpu_reclaim(struct work_struct *work) pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { pcpu_depopulate_chunk(chunk, rs, re); - bitmap_clear(chunk->populated, rs, re - rs); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); } @@ -1041,7 +1138,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -1055,7 +1152,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -1459,6 +1559,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1488,6 +1589,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1498,6 +1600,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ -- cgit 1.2.3-korg From fe6bd8c3d28357174587c4fe895d10b00321b692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: rename 
pcpu_reclaim_work to pcpu_balance_work pcpu_reclaim_work will also be used to populate chunks asynchronously. Rename it to pcpu_balance_work in preparation. pcpu_reclaim() is renamed to pcpu_balance_workfn() and some of its local variables are renamed too. This is pure rename. Signed-off-by: Tejun Heo --- mm/percpu.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 4f2d58760c9c25..28a830590b4c28 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -168,9 +168,9 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +/* balance work is used to populate or destroy chunks asynchronously */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_addr_in_first_chunk(void *addr) { @@ -1080,36 +1080,33 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - reclaim fully free chunks, workqueue function * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { + list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { @@ -1163,7 +1160,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + schedule_work(&pcpu_balance_work); break; } } -- cgit 1.2.3-korg From 1a4d76076cda69b0abf15463a8cebc172406da25 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implement asynchronous chunk population The percpu allocator now supports atomic allocations by only allocating from already populated areas but the mechanism to ensure that there's adequate amount of populated areas was missing. This patch expands pcpu_balance_work so that in addition to freeing excess free chunks it also populates chunks to maintain an adequate level of populated areas. pcpu_alloc() schedules pcpu_balance_work if the amount of free populated areas is too low or after an atomic allocation failure. * PERPCU_DYNAMIC_RESERVE is increased by two pages to account for PCPU_EMPTY_POP_PAGES_LOW. 
* pcpu_async_enabled is added to gate both async jobs - chunk->map_extend_work and pcpu_balance_work - so that we don't end up scheduling them while the needed subsystems aren't up yet. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 4 +- mm/percpu.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 115 insertions(+), 6 deletions(-) (limited to 'mm/percpu.c') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d1b416da25ed40..a3aa63e47637c9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,9 +48,9 @@ * intelligent way to determine this would be nice. */ #if BITS_PER_LONG > 32 -#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#define PERCPU_DYNAMIC_RESERVE (28 << 10) #else -#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#define PERCPU_DYNAMIC_RESERVE (20 << 10) #endif extern void *pcpu_base_addr; diff --git a/mm/percpu.c b/mm/percpu.c index 28a830590b4c28..867efd38d87911 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -78,6 +78,8 @@ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* balance work is used to populate or destroy chunks asynchronously */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) schedule_work(&chunk->map_extend_work); } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; @@ -1005,6 +1021,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages -= occ_pages; + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1023,6 +1042,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } @@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. 
@@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work) LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_balance_work); + pcpu_schedule_balance_work(); break; } } @@ -2187,3 +2284,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); -- cgit 1.2.3-korg From 23cb8981ed929b4dd48141401cd0fd31e0fa4ed0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Sep 2014 08:02:45 +0900 Subject: percpu: fix locking regression in the failure path of pcpu_alloc() While updating locking, b38d08f3181c ("percpu: restructure locking") broke pcpu_create_chunk() creation path in pcpu_alloc(). It returns without releasing pcpu_alloc_mutex. Fix it. 
Signed-off-by: Tejun Heo Reported-by: Julia Lawall --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 867efd38d87911..af3dd2704efd82 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -974,6 +974,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } -- cgit 1.2.3-korg From bb2e226b3bef596dd56be97df655d857b4603923 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 21 Sep 2014 15:04:53 -0700 Subject: Revert "percpu: free percpu allocation info for uniprocessor system" This reverts commit 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system"). The commit causes a hang with a crisv32 image. This may be an architecture problem, but at least for now the revert is necessary to be able to boot a crisv32 image. Cc: Tejun Heo Cc: Honggang Li Signed-off-by: Guenter Roeck Signed-off-by: Tejun Heo Fixes: 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system") Cc: stable@vger.kernel.org # Please don't apply 3189eddbcafc --- mm/percpu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index af3dd2704efd82..e10f9f7a888742 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2250,8 +2250,6 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); - - pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ -- cgit 1.2.3-korg From 6ae833c7fe0c6ef1f0ab13cc775da230d6f4c256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Oct 2014 12:01:52 -0400 Subject: percpu: fix how @gfp is interpreted by the percpu allocator When @gfp is specified, the percpu allocator is interested in whether it contains all of GFP_KERNEL or not. If it does, the normal allocation path is taken; otherwise, the atomic allocation path. Unfortunately, pcpu_alloc() was incorrectly testing for whether @gfp contains any part of GFP_KERNEL. Fix it by testing "(gfp & GFP_KERNEL) != GFP_KERNEL" instead of "!(gfp & GFP_KERNEL)" to decide whether the allocation should be atomic or not. Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index e10f9f7a888742..014bab65e0ffd8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -876,7 +876,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - bool is_atomic = !(gfp & GFP_KERNEL); + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; -- cgit 1.2.3-korg
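
As an illustrative aside to "percpu: move region iterations out of pcpu_[de]populate_chunk()": the callers now walk the [un]populated page regions themselves. The sketch below reduces that iteration pattern to a self-contained form over a populated-pages bitmap. It is a sketch, not the allocator's code: the real iteration uses the internal pcpu_next_unpop()/pcpu_for_each_unpop_region() helpers on chunk->populated, and the function name walk_unpop_regions() and the nr_pages parameter are invented here.

#include <linux/bitmap.h>
#include <linux/printk.h>

/* Walk every contiguous run of unpopulated (zero) pages in @populated. */
static void walk_unpop_regions(const unsigned long *populated, int nr_pages)
{
	int rs, re;	/* region start/end, in pages */

	for (rs = 0; rs < nr_pages; rs = re) {
		/* first unpopulated page at or after @rs */
		rs = find_next_zero_bit(populated, nr_pages, rs);
		if (rs >= nr_pages)
			break;
		/* the next populated page terminates the region */
		re = find_next_bit(populated, nr_pages, rs + 1);

		/* [rs, re) is one region a caller handles as a unit */
		pr_info("unpopulated region: pages [%d, %d)\n", rs, re);
	}
}

Each such [rs, re) range is what pcpu_alloc() hands to pcpu_populate_chunk() and what pcpu_balance_workfn() clips against nr_to_pop in the later patches.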
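
A hedged usage sketch of the GFP-aware interface added by "percpu: implement [__]alloc_percpu_gfp()" (and whose @gfp test the final fix above changes to check for the full GFP_KERNEL mask): struct foo_stats and the two wrapper functions are hypothetical names for this example only; alloc_percpu_gfp(), free_percpu() and GFP_NOWAIT are the real interfaces.

#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/types.h>

struct foo_stats {
	u64 events;
};

static struct foo_stats __percpu *foo_alloc_stats(gfp_t gfp)
{
	/*
	 * If @gfp contains all of GFP_KERNEL, the allocator may extend area
	 * maps and create/populate chunks.  Otherwise (e.g. GFP_NOWAIT) it
	 * serves only from already-populated areas and relies on
	 * pcpu_balance_work to replenish the reserve, so it is likelier to
	 * fail and the caller must handle NULL.
	 */
	return alloc_percpu_gfp(struct foo_stats, gfp);
}

static void foo_free_stats(struct foo_stats __percpu *stats)
{
	/* the free path only touches index data and is atomic-safe */
	free_percpu(stats);
}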