From f7942430e40f14c6d2ca48a1875add509938c07d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:41 -0800 Subject: lib: radix-tree: native accounting of exceptional entries The way the page cache is sneaking shadow entries of evicted pages into the radix tree past the node entry accounting and tracking them manually in the upper bits of node->count is fraught with problems. These shadow entries are marked in the tree as exceptional entries, which are a native concept to the radix tree. Maintain an explicit counter of exceptional entries in the radix tree node. Subsequent patches will switch shadow entry tracking over to that counter. DAX and shmem are the other users of exceptional entries. Since slot replacements that change the entry type from regular to exceptional must now be accounted, introduce a __radix_tree_replace() function that does replacement and accounting, and switch DAX and shmem over. The increase in radix tree node size is temporary. A followup patch switches the shadow tracking to this new scheme and we'll no longer need the upper bits in node->count and shrink that back to one byte. Link: http://lkml.kernel.org/r/20161117192945.GA23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e6d552c40ddfc..7885796d35ae74 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +753,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + void *old = rcu_dereference_raw(*slot); + int exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + WARN_ON_ONCE(!!item - !!old); + + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + + if (node) + node->exceptional += exceptional; + + rcu_assign_pointer(*slot, item); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root, delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; + if (radix_tree_exceptional_entry(entry)) + node->exceptional--; __radix_tree_delete_node(root, node); -- cgit 1.2.3-korg From 6d75f366b9242f9b17ed7d0b0604d7460f818f21 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:43 -0800 Subject: lib: radix-tree: check accounting of existing slot replacement users The bug in khugepaged fixed earlier in this series shows that radix tree slot replacement is fragile; and it will become more so when not only NULL<->!NULL transitions need to be caught but transitions from and to exceptional entries as well. We need checks. Re-implement radix_tree_replace_slot() on top of the sanity-checked __radix_tree_replace(). This requires existing callers to also pass the radix tree root, but it'll warn us when somebody replaces slots with contents that need proper accounting (transitions between NULL entries, real entries, exceptional entries) and where a replacement through the slot pointer would corrupt the radix tree node counts. Link: http://lkml.kernel.org/r/20161117193021.GB23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gmap.c | 2 +- drivers/sh/intc/virq.c | 2 +- fs/dax.c | 4 +-- include/linux/radix-tree.h | 16 ++------- lib/radix-tree.c | 63 +++++++++++++++++++++++++++-------- mm/filemap.c | 4 +-- mm/khugepaged.c | 5 +-- mm/migrate.c | 4 +-- mm/truncate.c | 2 +- tools/testing/radix-tree/multiorder.c | 2 +- 10 files changed, 64 insertions(+), 40 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce47c..ec1f0dedb948df 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index e7899624aa0b63..35bbe288ddb4eb 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -254,7 +254,7 @@ static void __init intc_subgroup_map(struct intc_desc_int *d) radix_tree_tag_clear(&d->tree, entry->enum_id, INTC_TAG_VIRQ_NEEDS_ALLOC); - radix_tree_replace_slot((void **)entries[i], + radix_tree_replace_slot(&d->tree, (void **)entries[i], &intc_irq_xlate[irq]); } diff --git a/fs/dax.c b/fs/dax.c index db78bae0dc0f8d..85930c2a2749fc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7ced8a70cc8b47..2d1b9b8be983a4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -/** - * radix_tree_replace_slot - replace item in a slot - * @pslot: pointer to slot, returned by radix_tree_lookup_slot - * @item: new item to store in the slot. - * - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -static inline void radix_tree_replace_slot(void **pslot, void *item) -{ - BUG_ON(radix_tree_is_internal_node(item)); - rcu_assign_pointer(*pslot, item); -} - int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp); @@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item); +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7885796d35ae74..f91d5b0af65429 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -753,19 +753,10 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); -/** - * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. - * - * For use with __radix_tree_lookup(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -void __radix_tree_replace(struct radix_tree_root *root, - struct radix_tree_node *node, - void **slot, void *item) +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); int exceptional; @@ -776,7 +767,7 @@ void __radix_tree_replace(struct radix_tree_root *root, exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + WARN_ON_ONCE(warn_typeswitch && exceptional); if (node) node->exceptional += exceptional; @@ -784,6 +775,50 @@ void __radix_tree_replace(struct radix_tree_root *root, rcu_assign_pointer(*slot, item); } +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + /* + * This function supports replacing exceptional entries, but + * that needs accounting against the node unless the slot is + * root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry to + * another, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index caa779f8797fcb..1ba726aef708ae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping, false); } } - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, slot, page); mapping->nrpages++; if (node) { workingset_node_pages_inc(node); @@ -196,7 +196,7 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(slot, shadow); + radix_tree_replace_slot(&mapping->page_tree, slot, shadow); if (!node) break; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5d7c006373d305..7a50c726c5ae13 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1426,7 +1426,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_next(&iter); @@ -1538,7 +1538,8 @@ static void collapse_shmem(struct mm_struct *mm, /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 66ce6b490b13a9..0ed24b1fa77b89 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d10..3c631c35787320 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping, goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + radix_tree_replace_slot(&mapping->page_tree, slot, NULL); mapping->nrexceptional--; if (!node) goto unlock; diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 05d7bc4889711d..d1be94667a30cd 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) slot = radix_tree_lookup_slot(&tree, index); free(*slot); - radix_tree_replace_slot(slot, item2); + radix_tree_replace_slot(&tree, slot, item2); for (i = min; i < max; i++) { struct item *item = item_lookup(&tree, i); assert(item != 0); -- cgit 1.2.3-korg From f4b109c6dad54257eca837f9dd16a23f2eeab832 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:46 -0800 Subject: lib: radix-tree: add entry deletion support to __radix_tree_replace() Page cache shadow entry handling will be a lot simpler when it can use a single generic replacement function for pages, shadow entries, and emptying slots. Make __radix_tree_replace() properly account insertions and deletions in node->count and garbage collect nodes as they become empty. Then re-implement radix_tree_delete() on top of it. Link: http://lkml.kernel.org/r/20161117193058.GC23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 227 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 116 insertions(+), 111 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f91d5b0af65429..5d8930f3b3d867 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -538,6 +538,107 @@ static int radix_tree_extend(struct radix_tree_root *root, return maxshift + RADIX_TREE_MAP_SHIFT; } +/** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline bool radix_tree_shrink(struct radix_tree_root *root) +{ + bool shrunk = false; + + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + if (!radix_tree_is_internal_node(child)) + node->slots[0] = RADIX_TREE_RETRY; + + radix_tree_node_free(node); + shrunk = true; + } + + return shrunk; +} + +static bool delete_node(struct radix_tree_root *root, + struct radix_tree_node *node) +{ + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + deleted |= radix_tree_shrink(root); + return deleted; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root @@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root, bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); - int exceptional; + int count, exceptional; WARN_ON_ONCE(radix_tree_is_internal_node(item)); - WARN_ON_ONCE(!!item - !!old); + count = !!item - !!old; exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(warn_typeswitch && exceptional); + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); - if (node) + if (node) { + node->count += count; node->exceptional += exceptional; + } rcu_assign_pointer(*slot, item); } @@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root, void **slot, void *item) { /* - * This function supports replacing exceptional entries, but - * that needs accounting against the node unless the slot is - * root->rnode. + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. */ replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); + + delete_node(root, node); } /** @@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root, * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and exceptional entries, as that requires accounting - * inside the radix tree node. When switching from one type of entry to - * another, use __radix_tree_lookup() and __radix_tree_replace(). + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item) @@ -1466,75 +1571,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) } #endif /* CONFIG_SHMEM && CONFIG_SWAP */ -/** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root @@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; + return delete_node(root, node); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - if (radix_tree_exceptional_entry(entry)) - node->exceptional--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL); return entry; } -- cgit 1.2.3-korg From 4d693d08607ab319095ec8942909df4b4aebdf66 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:49 -0800 Subject: lib: radix-tree: update callback for changing leaf nodes Support handing __radix_tree_replace() a callback that gets invoked for all leaf nodes that change or get freed as a result of the slot replacement, to assist users tracking nodes with node->private_list. This prepares for putting page cache shadow entries into the radix tree root again and drastically simplifying the shadow tracking. Link: http://lkml.kernel.org/r/20161117193134.GD23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- include/linux/radix-tree.h | 4 +++- lib/radix-tree.c | 42 +++++++++++++++++++++++++++++------------- mm/shmem.c | 3 ++- 4 files changed, 36 insertions(+), 16 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/fs/dax.c b/fs/dax.c index 85930c2a2749fc..6916ed37d46318 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d1b9b8be983a4..15c972ea951003 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item); + void **slot, void *item, + radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5d8930f3b3d867..df4ff18dd63c3b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -542,7 +541,9 @@ static int radix_tree_extend(struct radix_tree_root *root, * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) { bool shrunk = false; @@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_internal_node(child)) + node->count = 0; + if (!radix_tree_is_internal_node(child)) { node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } radix_tree_node_free(node); shrunk = true; @@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) { bool deleted = false; @@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); + deleted |= radix_tree_shrink(root, update_node, + private); return deleted; } @@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root, /** * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item) + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) { /* * This function supports replacing exceptional entries and @@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root, replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); - delete_node(root, node); + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); } /** @@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node); + return delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - __radix_tree_replace(root, node, slot, NULL); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } diff --git a/mm/shmem.c b/mm/shmem.c index 3149ddee8f557c..abd7403aba41f0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } -- cgit 1.2.3-korg From 14b468791fa955d442f962fdf5207dfd39a131c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:52 -0800 Subject: mm: workingset: move shadow entry tracking to radix tree exceptional tracking Currently, we track the shadow entries in the page cache in the upper bits of the radix_tree_node->count, behind the back of the radix tree implementation. Because the radix tree code has no awareness of them, we rely on random subtleties throughout the implementation (such as the node->count != 1 check in the shrinking code, which is meant to exclude multi-entry nodes but also happens to skip nodes with only one shadow entry, as that's accounted in the upper bits). This is error prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). To remove these subtleties, this patch moves shadow entry tracking from the upper bits of node->count to the existing counter for exceptional entries. node->count goes back to being a simple counter of valid entries in the tree node and can be shrunk to a single byte. This vastly simplifies the page cache code. All accounting happens natively inside the radix tree implementation, and maintaining the LRU linkage of shadow nodes is consolidated into a single function in the workingset code that is called for leaf nodes affected by a change in the page cache tree. This also removes the last user of the __radix_delete_node() return value. Eliminate it. Link: http://lkml.kernel.org/r/20161117193211.GE23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 8 ++----- include/linux/swap.h | 34 +--------------------------- lib/radix-tree.c | 25 +++++---------------- mm/filemap.c | 54 +++++--------------------------------------- mm/truncate.c | 21 +++-------------- mm/workingset.c | 56 +++++++++++++++++++++++++++++++++++----------- 6 files changed, 60 insertions(+), 138 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 15c972ea951003..744486057e9e74 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Internally used bits of node->count */ -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) - struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ - unsigned int count; /* Total entry count */ + unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ union { struct { @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b7b..09b212d37f1dad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -246,39 +246,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; - -static inline unsigned int workingset_node_pages(struct radix_tree_node *node) -{ - return node->count & RADIX_TREE_COUNT_MASK; -} - -static inline void workingset_node_pages_inc(struct radix_tree_node *node) -{ - node->count++; -} - -static inline void workingset_node_pages_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_pages(node)); - node->count--; -} - -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) -{ - return node->count >> RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_inc(struct radix_tree_node *node) -{ - node->count += 1U << RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_shadows(node)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; -} +void workingset_update_node(struct radix_tree_node *node, void *private); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index df4ff18dd63c3b..9dbfaac05e6c2c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -541,12 +541,10 @@ static int radix_tree_extend(struct radix_tree_root *root, * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, +static inline void radix_tree_shrink(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private) { - bool shrunk = false; - for (;;) { struct radix_tree_node *node = root->rnode; struct radix_tree_node *child; @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } radix_tree_node_free(node); - shrunk = true; } - - return shrunk; } -static bool delete_node(struct radix_tree_root *root, +static void delete_node(struct radix_tree_root *root, struct radix_tree_node *node, radix_tree_update_node_t update_node, void *private) { - bool deleted = false; - do { struct radix_tree_node *parent; if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); - return deleted; + radix_tree_shrink(root, update_node, private); + return; } parent = node->parent; @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root, } radix_tree_node_free(node); - deleted = true; node = parent; } while (node); - - return deleted; } /** @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node, NULL, NULL); + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, diff --git a/mm/filemap.c b/mm/filemap.c index 1ba726aef708ae..dc3e5fce0b7b0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(&mapping->page_tree, slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } @@ -185,8 +167,6 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - if (!node) { VM_BUG_ON_PAGE(nr != 1, page); /* @@ -196,33 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(&mapping->page_tree, slot, shadow); - - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { diff --git a/mm/truncate.c b/mm/truncate.c index 3c631c35787320..fd97f1dbce290f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(&mapping->page_tree, slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/workingset.c b/mm/workingset.c index 98f830897b1ba6..ef556bf1323d4b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -334,18 +335,45 @@ void workingset_activation(struct page *page) * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; + unsigned long nodes; unsigned long pages; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); if (sc->memcg) { @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, */ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!workingset_node_shadows(node))) + if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; - if (WARN_ON_ONCE(workingset_node_pages(node))) + if (WARN_ON_ONCE(node->count != node->exceptional)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(workingset_node_shadows(node))) + if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node); @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -496,7 +526,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -504,7 +534,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } -- cgit 1.2.3-korg