From 1bc545bff45ce9eefc176ccf663074462a209cb6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 21 Jun 2023 02:31:01 +0000 Subject: mm/vmscan: fix root proactive reclaim unthrottling unbalanced node When memory.reclaim was introduced, it became the first case where cgroup_reclaim() is true for the root cgroup. Johannes concluded [1] that for most cases this is okay, except for one case. Historically, kswapd would throttle reclaim on a node if a lot of pages marked for reclaim are under writeback (aka the node is congested). This occurred by setting LRUVEC_CONGESTED bit in lruvec->flags. The bit would be cleared when the node is balanced. Similarly, cgroup reclaim would set the same bit when an lruvec is congested, and clear it on the way out of reclaim (to throttle local reclaimers). Before the introduction of memory.reclaim, the root memcg was the only target of kswapd reclaim, and non-root memcgs were the only targets of cgroup reclaim, so they would never interfere. Using the same bit for both was fine. After memory.reclaim, it is possible for cgroup reclaim on the root cgroup to clear the bit set by kswapd. This would result in reclaim on the node to be unthrottled before the node is balanced. Fix this by introducing separate bits for cgroup-level and node-level congestion. kswapd can unthrottle an lruvec that is marked as congested by cgroup reclaim (as the entire node should no longer be congested), but not vice versa (to prevent premature unthrottling before the entire node is balanced). [1]https://lore.kernel.org/lkml/20230405200150.GA35884@cmpxchg.org/ Link: https://lkml.kernel.org/r/20230621023101.432780-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20230405200150.GA35884@cmpxchg.org/ Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b7068be8a0349b..1080209a568bba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6578,10 +6578,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ - if ((current_is_kswapd() || - (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) + set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); + + if (current_is_kswapd()) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + } /* * Stall direct reclaim for IO completions if the lruvec is @@ -6591,7 +6594,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && - test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) @@ -6848,7 +6852,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } @@ -7237,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit 1.2.3-korg