From 592238c03ef9e44ad6031b671da869ebcbffa8dc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 8 May 2025 17:28:00 -0400
Subject: pnode: lift peers() into pnode.h

it's going to be useful both in pnode.c and namespace.c

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index ffd429b760d5d4..aa187144e3894f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -216,11 +216,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source, *dest_master;
 static struct hlist_head *list;
 
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
-	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
-- 
cgit 1.2.3-korg


From f0d0ba19985d23a3e83d654318ccb6e9c5f1b095 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 14 May 2025 20:50:06 -0400
Subject: Rewrite of propagate_umount()

The variant currently in the tree has problems; trying to prove
correctness has caught at least one class of bugs (reparenting
that ends up moving the visible location of reparented mount, due
to not excluding some of the counterparts on propagation that
should've been included).

I tried to prove that it's the only bug there; I'm still not sure
whether it is.  If anyone can reconstruct and write down an analysis
of the mainline implementation, I'll gladly review it; as it is,
I ended up doing a different implementation.  Candidate collection
phase is similar, but trimming the set down until it satisfies the
constraints turned out pretty different.

I hoped to do transformation as a massage series, but that turns out
to be too convoluted.  So it's a single patch replacing propagate_umount()
and friends in one go, with notes and analysis in D/f/propagate_umount.txt
(in addition to inline comments).

As far I can tell, it is provably correct and provably linear by the number
of mounts we need to look at in order to decide what should be unmounted.
It even builds and seems to survive testing...

Another nice thing that fell out of that is that ->mnt_umounting is no longer
needed.

Compared to the first version:
	* explicit MNT_UMOUNT_CANDIDATE flag for is_candidate()
	* trim_ancestors() only clears that flag, leaving the suckers on list
	* trim_one() and handle_locked() take the stuff with flag cleared off
the list.  That allows to iterate with list_for_each_entry_safe() when calling
trim_one() - it removes at most one element from the list now.
	* no globals - I didn't bother with any kind of context, not worth it.

	* Notes updated accordingly; I have not touch the terms yet.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/propagate_umount.txt | 484 +++++++++++++++++++++++++
 fs/mount.h                                     |   1 -
 fs/namespace.c                                 |   1 -
 fs/pnode.c                                     | 362 +++++++++---------
 fs/pnode.h                                     |   2 +-
 include/linux/mount.h                          |   3 +-
 6 files changed, 685 insertions(+), 168 deletions(-)
 create mode 100644 Documentation/filesystems/propagate_umount.txt

(limited to 'fs/pnode.c')

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
new file mode 100644
index 00000000000000..6906903a8aa29f
--- /dev/null
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -0,0 +1,484 @@
+	Notes on propagate_umount()
+
+Umount propagation starts with a set of mounts we are already going to
+take out.  Ideally, we would like to add all downstream cognates to
+that set - anything with the same mountpoint as one of the removed
+mounts and with parent that would receive events from the parent of that
+mount.  However, there are some constraints the resulting set must
+satisfy.
+
+It is convenient to define several properties of sets of mounts:
+
+1) A set S of mounts is non-shifting if for any mount X belonging
+to S all subtrees mounted strictly inside of X (i.e. not overmounting
+the root of X) contain only elements of S.
+
+2) A set S is non-revealing if all locked mounts that belong to S have
+parents that also belong to S.
+
+3) A set S is closed if it contains all children of its elements.
+
+The set of mounts taken out by umount(2) must be non-shifting and
+non-revealing; the first constraint is what allows to reparent
+any remaining mounts and the second is what prevents the exposure
+of any concealed mountpoints.
+
+propagate_umount() takes the original set as an argument and tries to
+extend that set.  The original set is a full subtree and its root is
+unlocked; what matters is that it's closed and non-revealing.
+Resulting set may not be closed; there might still be mounts outside
+of that set, but only on top of stacks of root-overmounting elements
+of set.  They can be reparented to the place where the bottom of
+stack is attached to a mount that will survive.  NOTE: doing that
+will violate a constraint on having no more than one mount with
+the same parent/mountpoint pair; however, the caller (umount_tree())
+will immediately remedy that - it may keep unmounted element attached
+to parent, but only if the parent itself is unmounted.  Since all
+conflicts created by reparenting have common parent *not* in the
+set and one side of the conflict (bottom of the stack of overmounts)
+is in the set, it will be resolved.  However, we rely upon umount_tree()
+doing that pretty much immediately after the call of propagate_umount().
+
+Algorithm is based on two statements:
+	1) for any set S, there is a maximal non-shifting subset of S
+and it can be calculated in O(#S) time.
+	2) for any non-shifting set S, there is a maximal non-revealing
+subset of S.  That subset is also non-shifting and it can be calculated
+in O(#S) time.
+
+		Finding candidates.
+
+We are given a closed set U and we want to find all mounts that have
+the same mountpoint as some mount m in U *and* whose parent receives
+propagation from the parent of the same mount m.  Naive implementation
+would be
+	S = {}
+	for each m in U
+		add m to S
+		p = parent(m)
+		for each q in Propagation(p) - {p}
+			child = look_up(q, mountpoint(m))
+			if child
+				add child to S
+but that can lead to excessive work - there might be propagation among the
+subtrees of U, in which case we'd end up examining the same candidates
+many times.  Since propagation is transitive, the same will happen to
+everything downstream of that candidate and it's not hard to construct
+cases where the approach above leads to the time quadratic by the actual
+number of candidates.
+
+Note that if we run into a candidate we'd already seen, it must've been
+added on an earlier iteration of the outer loop - all additions made
+during one iteration of the outer loop have different parents.  So
+if we find a child already added to the set, we know that everything
+in Propagation(parent(child)) with the same mountpoint has been already
+added.
+	S = {}
+	for each m in U
+		if m in S
+			continue
+		add m to S
+		p = parent(m)
+		q = propagation_next(p, p)
+		while q
+			child = look_up(q, mountpoint(m))
+			if child
+				if child in S
+					q = skip_them(q, p)
+					continue;
+				add child to S
+			q = propagation_next(q, p)
+where
+skip_them(q, p)
+	keep walking Propagation(p) from q until we find something
+	not in Propagation(q)
+
+would get rid of that problem, but we need a sane implementation of
+skip_them().  That's not hard to do - split propagation_next() into
+"down into mnt_slave_list" and "forward-and-up" parts, with the
+skip_them() being "repeat the forward-and-up part until we get NULL
+or something that isn't a peer of the one we are skipping".
+
+Note that there can be no absolute roots among the extra candidates -
+they all come from mount lookups.  Absolute root among the original
+set is _currently_ impossible, but it might be worth protecting
+against.
+
+		Maximal non-shifting subsets.
+
+Let's call a mount m in a set S forbidden in that set if there is a
+subtree mounted strictly inside m and containing mounts that do not
+belong to S.
+
+The set is non-shifting when none of its elements are forbidden in it.
+
+If mount m is forbidden in a set S, it is forbidden in any subset S' it
+belongs to.  In other words, it can't belong to any of the non-shifting
+subsets of S.  If we had a way to find a forbidden mount or show that
+there's none, we could use it to find the maximal non-shifting subset
+simply by finding and removing them until none remain.
+
+Suppose mount m is forbidden in S; then any mounts forbidden in S - {m}
+must have been forbidden in S itself.  Indeed, since m has descendents
+that do not belong to S, any subtree that fits into S will fit into
+S - {m} as well.
+
+So in principle we could go through elements of S, checking if they
+are forbidden in S and removing the ones that are.  Removals will
+not invalidate the checks done for earlier mounts - if they were not
+forbidden at the time we checked, they won't become forbidden later.
+It's too costly to be practical, but there is a similar approach that
+is linear by size of S.
+
+Let's say that mount x in a set S is forbidden by mount y, if
+	* both x and y belong to S.
+	* there is a chain of mounts starting at x and leaving S
+	  immediately after passing through y, with the first
+	  mountpoint strictly inside x.
+Note 1: x may be equal to y - that's the case when something not
+belonging to S is mounted strictly inside x.
+Note 2: if y does not belong to S, it can't forbid anything in S.
+Note 3: if y has no children outside of S, it can't forbid anything in S.
+
+It's easy to show that mount x is forbidden in S if and only if x is
+forbidden in S by some mount y.  And it's easy to find all mounts in S
+forbidden by a given mount.
+
+Consider the following operation:
+	Trim(S, m) = S - {x : x is forbidden by m in S}
+
+Note that if m does not belong to S or has no children outside of S we
+are guaranteed that Trim(S, m) is equal to S.
+
+The following is true: if x is forbidden by y in Trim(S, m), it was
+already forbidden by y in S.
+
+Proof: Suppose x is forbidden by y in Trim(S, m).  Then there is a
+chain of mounts (x_0 = x, ..., x_k = y, x_{k+1} = r), such that x_{k+1}
+is the first element that doesn't belong to Trim(S, m) and the
+mountpoint of x_1 is strictly inside x.  If mount r belongs to S, it must
+have been removed by Trim(S, m), i.e. it was forbidden in S by m.
+Then there was a mount chain from r to some child of m that stayed in
+S all the way until m, but that's impossible since x belongs to Trim(S, m)
+and prepending (x_0, ..., x_k) to that chain demonstrates that x is also
+forbidden in S by m, and thus can't belong to Trim(S, m).
+Therefore r can not belong to S and our chain demonstrates that
+x is forbidden by y in S.  QED.
+
+Corollary: no mount is forbidden by m in Trim(S, m).  Indeed, any
+such mount would have been forbidden by m in S and thus would have been
+in the part of S removed in Trim(S, m).
+
+Corollary: no mount is forbidden by m in Trim(Trim(S, m), n).  Indeed,
+any such would have to have been forbidden by m in Trim(S, m), which
+is impossible.
+
+Corollary: after
+	S = Trim(S, x_1)
+	S = Trim(S, x_2)
+	...
+	S = Trim(S, x_k)
+no mount remaining in S will be forbidden by either of x_1,...,x_k.
+
+The following will reduce S to its maximal non-shifting subset:
+	visited = {}
+	while S contains elements not belonging to visited
+		let m be an arbitrary such element of S
+		S = Trim(S, m)
+		add m to visited
+
+S never grows, so the number of elements of S not belonging to visited
+decreases at least by one on each iteration.  When the loop terminates,
+all mounts remaining in S belong to visited.  It's easy to see that at
+the beginning of each iteration no mount remaining in S will be forbidden
+by any element of visited.  In other words, no mount remaining in S will
+be forbidden, i.e. final value of S will be non-shifting.  It will be
+the maximal non-shifting subset, since we were removing only forbidden
+elements.
+
+	There are two difficulties in implementing the above in linear
+time, both due to the fact that Trim() might need to remove more than one
+element.  Naive implementation of Trim() is vulnerable to running into a
+long chain of mounts, each mounted on top of parent's root.  Nothing in
+that chain is forbidden, so nothing gets removed from it.  We need to
+recognize such chains and avoid walking them again on subsequent calls of
+Trim(), otherwise we will end up with worst-case time being quadratic by
+the number of elements in S.  Another difficulty is in implementing the
+outer loop - we need to iterate through all elements of a shrinking set.
+That would be trivial if we never removed more than one element at a time
+(linked list, with list_for_each_entry_safe for iterator), but we may
+need to remove more than one entry, possibly including the ones we have
+already visited.
+
+	Let's start with naive algorithm for Trim():
+
+Trim_one(m)
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove m from S
+				break
+	if found
+		Trim_ancestors(m)
+
+Trim_ancestors(m)
+	for (; parent(m) in S; m = parent(m)) {
+		if (mountpoint(m) != root(parent(m)))
+			remove parent(m) from S
+	}
+
+If m belongs to S, Trim_one(m) will replace S with Trim(S, m).
+Proof:
+	Consider the chains excluding elements from Trim(S, m).  The last
+two elements in such chain are m and some child of m that does not belong
+to S.  If m has no such children, Trim(S, m) is equal to S.
+	m itself is removed if and only if the chain has exactly two
+elements, i.e. when the last element does not overmount the root of m.
+In other words, that happens when m has a child not in S that does not
+overmount the root of m.
+	All other elements to remove will be ancestors of m, such that
+the entire descent chain from them to m is contained in S.  Let
+(x_0, x_1, ..., x_k = m) be the longest such chain.  x_i needs to be
+removed if and only if x_{i+1} does not overmount its root.  It's easy
+to see that Trim_ancestors(m) will iterate through that chain from
+x_k to x_1 and that it will remove exactly the elements that need to be
+removed.
+
+	Note that if the loop in Trim_ancestors() walks into an already
+visited element, we are guaranteed that remaining iterations will see
+only elements that had already been visited and remove none of them.
+That's the weakness that makes it vulnerable to long chains of full
+overmounts.
+
+	It's easy to deal with, if we can afford setting marks on
+elements of S; we would mark all elements already visited by
+Trim_ancestors() and have it bail out as soon as it sees an already
+marked element.
+
+	The problems with iterating through the set can be dealt with in
+several ways, depending upon the representation we choose for our set.
+One useful observation is that we are given a closed subset in S - the
+original set passed to propagate_umount().  Its elements can neither
+forbid anything nor be forbidden by anything - all their descendents
+belong to S, so they can not occur anywhere in any excluding chain.
+In other words, the elements of that subset will remain in S until
+the end and Trim_one(S, m) is a no-op for all m from that subset.
+
+	That suggests keeping S as a disjoint union of a closed set U
+('will be unmounted, no matter what') and the set of all elements of
+S that do not belong to U.  That set ('candidates') is all we need
+to iterate through.  Let's represent it as a subset in a cyclic list,
+consisting of all list elements that are marked as candidates (initially -
+all of them).  Then we could have Trim_ancestors() only remove the mark,
+leaving the elements on the list.  Then Trim_one() would never remove
+anything other than its argument from the containing list, allowing to
+use list_for_each_entry_safe() as iterator.
+
+	Assuming that representation we get the following:
+
+	list_for_each_entry_safe(m, ..., Candidates, ...)
+		Trim_one(m)
+where
+Trim_one(m)
+	if (m is not marked as a candidate)
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the Candidates list
+		return
+		
+	remove_this = false
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove_this = true
+				break
+	if found
+		Trim_ancestors(m)
+	if remove_this
+		strip the "seen by Trim_ancestors" mark from m
+		strip the "candidate" mark from m
+		remove m from the Candidate list
+
+Trim_ancestors(m)
+	for (p = parent(m); p is marked as candidate ; m = p, p = parent(p)) {
+		if m is marked as seen by Trim_ancestors
+			return
+		mark m as seen by Trim_ancestors
+		if (mountpoint(m) != root(p))
+			strip the "candidate" mark from p
+	}
+
+	Terminating condition in the loop in Trim_ancestors() is correct,
+since that that loop will never run into p belonging to U - p is always
+an ancestor of argument of Trim_one() and since U is closed, the argument
+of Trim_one() would also have to belong to U.  But Trim_one() is never
+called for elements of U.  In other words, p belongs to S if and only
+if it belongs to candidates.
+
+	Time complexity:
+* we get no more than O(#S) calls of Trim_one()
+* the loop over children in Trim_one() never looks at the same child
+twice through all the calls.
+* iterations of that loop for children in S are no more than O(#S)
+in the worst case
+* at most two children that are not elements of S are considered per
+call of Trim_one().
+* the loop in Trim_ancestors() sets its mark once per iteration and
+no element of S has is set more than once.
+
+	In the end we may have some elements excluded from S by
+Trim_ancestors() still stuck on the list.  We could do a separate
+loop removing them from the list (also no worse than O(#S) time),
+but it's easier to leave that until the next phase - there we will
+iterate through the candidates anyway.
+
+	The caller has already removed all elements of U from their parents'
+lists of children, which means that checking if child belongs to S is
+equivalent to checking if it's marked as a candidate; we'll never see
+the elements of U in the loop over children in Trim_one().
+
+	What's more, if we see that children(m) is empty and m is not
+locked, we can immediately move m into the committed subset (remove
+from the parent's list of children, etc.).  That's one fewer mount we'll
+have to look into when we check the list of children of its parent *and*
+when we get to building the non-revealing subset.
+
+		Maximal non-revealing subsets
+
+If S is not a non-revealing subset, there is a locked element x in S
+such that parent of x is not in S.
+
+Obviously, no non-revealing subset of S may contain x.  Removing such
+elements one by one will obviously end with the maximal non-revealing
+subset (possibly empty one).  Note that removal of an element will
+require removal of all its locked children, etc.
+
+If the set had been non-shifting, it will remain non-shifting after
+such removals.
+Proof: suppose S was non-shifting, x is a locked element of S, parent of x
+is not in S and S - {x} is not non-shifting.  Then there is an element m
+in S - {x} and a subtree mounted strictly inside m, such that m contains
+an element not in in S - {x}.  Since S is non-shifting, everything in
+that subtree must belong to S.  But that means that this subtree must
+contain x somewhere *and* that parent of x either belongs that subtree
+or is equal to m.  Either way it must belong to S.  Contradiction.
+
+// same representation as for finding maximal non-shifting subsets:
+// S is a disjoint union of a non-revealing set U (the ones we are committed
+// to unmount) and a set of candidates, represented as a subset of list
+// elements that have "is a candidate" mark on them.
+// Elements of U are removed from their parents' lists of children.
+// In the end candidates becomes empty and maximal non-revealing non-shifting
+// subset of S is now in U
+	while (Candidates list is non-empty)
+		handle_locked(first(Candidates))
+
+handle_locked(m)
+	if m is not marked as a candidate
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the list
+		return
+	cutoff = m
+	for (p = m; p in candidates; p = parent(p)) {
+		strip the "seen by Trim_ancestors" mark from p
+		strip the "candidate" mark from p
+		remove p from the Candidates list
+		if (!locked(p))
+			cutoff = parent(p)
+	}
+	if p in U
+		cutoff = p
+	while m != cutoff
+		remove m from children(parent(m))
+		add m to U
+		m = parent(m)
+
+Let (x_0, ..., x_n = m) be the maximal chain of descent of m within S.
+* If it contains some elements of U, let x_k be the last one of those.
+Then union of U with {x_{k+1}, ..., x_n} is obviously non-revealing.
+* otherwise if all its elements are locked, then none of {x_0, ..., x_n}
+may be elements of a non-revealing subset of S.
+* otherwise let x_k be the first unlocked element of the chain.  Then none
+of {x_0, ..., x_{k-1}} may be an element of a non-revealing subset of
+S and union of U and {x_k, ..., x_n} is non-revealing.
+
+handle_locked(m) finds which of these cases applies and adjusts Candidates
+and U accordingly.  U remains non-revealing, union of Candidates and
+U still contains any non-revealing subset of S and after the call of
+handle_locked(m) m is guaranteed to be not in Candidates list.  So having
+it called for each element of S would suffice to empty Candidates,
+leaving U the maximal non-revealing subset of S.
+
+However, handle_locked(m) is a no-op when m belongs to U, so it's enough
+to have it called for elements of Candidates list until none remain.
+
+Time complexity: number of calls of handle_locked() is limited by
+#Candidates, each iteration of the first loop in handle_locked() removes
+an element from the list, so their total number of executions is also
+limited by #Candidates; number of iterations in the second loop is no
+greater than the number of iterations of the first loop.
+
+
+		Reparenting
+
+After we'd calculated the final set, we still need to deal with
+reparenting - if an element of the final set has a child not in it,
+we need to reparent such child.
+
+Such children can only be root-overmounting (otherwise the set wouldn't
+be non-shifting) and their parents can not belong to the original set,
+since the original is guaranteed to be closed.
+
+
+		Putting all of that together
+
+The plan is to
+	* find all candidates
+	* trim down to maximal non-shifting subset
+	* trim down to maximal non-revealing subset
+	* reparent anything that needs to be reparented
+	* return the resulting set to the caller
+
+For the 2nd and 3rd steps we want to separate the set into growing
+non-revealing subset, initially containing the original set ("U" in
+terms of the pseudocode above) and everything we are still not sure about
+("candidates").  It means that for the output of the 1st step we'd like
+the extra candidates separated from the stuff already in the original set.
+For the 4th step we would like the additions to U separate from the
+original set.
+
+So let's go for
+	* original set ("set").  Linkage via mnt_list
+	* undecided candidates ("candidates").  Subset of a list,
+consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+Initially all elements of the list will be marked that way; in the
+end the list will become empty and no mounts will remain marked with
+that flag.
+	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* anything in U that hadn't been in the original set - elements of
+candidates will gradually be either discarded or moved there.  In other
+words, it's the candidates we have already decided to unmount.	Its role
+is reasonably close to the old "to_umount", so let's use that name.
+Linkage via mnt_list.
+
+For gather_candidates() we'll need to maintain both candidates (S -
+set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+all elements we encounter, putting the ones not already in the original
+set into the list of candidates.  When we are done, strip that flag from
+all elements of the original set.  That gives a cheap way to check
+if element belongs to S (in gather_candidates) and to candidates
+itself (at later stages).  Call that predicate is_candidate(); it would
+be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+
+All elements of the original set are marked with MNT_UMOUNT and we'll
+need the same for elements added when joining the contents of to_umount
+to set in the end.  Let's set MNT_UMOUNT at the time we add an element
+to to_umount; that's close to what the old 'umount_one' is doing, so
+let's keep that name.  It also gives us another predicate we need -
+"belongs to union of set and to_umount"; will_be_unmounted() for now.
+
+Removals from the candidates list should strip both MNT_MARKED and
+MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index f20e6ed845fec0..fb93d3e16724dd 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -79,7 +79,6 @@ struct mount {
 		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
 		struct hlist_node mnt_umount;
 	};
-	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6a0697eeda7426..f64895d47d7055 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -383,7 +383,6 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
-		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
diff --git a/fs/pnode.c b/fs/pnode.c
index aa187144e3894f..901d40946d3414 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,11 +24,6 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
-static inline struct mount *last_slave(struct mount *p)
-{
-	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
-}
-
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -136,6 +131,23 @@ void change_mnt_propagation(struct mount *mnt, int type)
 	}
 }
 
+static struct mount *__propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
 /*
  * get the next mount in the propagation tree.
  * @m: the mount seen last
@@ -153,31 +165,21 @@ static struct mount *propagation_next(struct mount *m,
 	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
-	while (1) {
-		struct mount *master = m->mnt_master;
-
-		if (master == origin->mnt_master) {
-			struct mount *next = next_peer(m);
-			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
-			return next_slave(m);
-
-		/* back at master */
-		m = master;
-	}
+	return __propagation_next(m, origin);
 }
 
 static struct mount *skip_propagation_subtree(struct mount *m,
 						struct mount *origin)
 {
 	/*
-	 * Advance m such that propagation_next will not return
-	 * the slaves of m.
+	 * Advance m past everything that gets propagation from it.
 	 */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		m = last_slave(m);
+	struct mount *p = __propagation_next(m, origin);
+
+	while (p && peers(m, p))
+		p = __propagation_next(p, origin);
 
-	return m;
+	return p;
 }
 
 static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -458,181 +460,213 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static inline bool is_candidate(struct mount *m)
 {
-	CLEAR_MNT_MARK(mnt);
-	mnt->mnt.mnt_flags |= MNT_UMOUNT;
-	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_umounting);
-	move_from_ns(mnt, to_umount);
+	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
 }
 
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
-			       struct list_head *to_umount,
-			       struct list_head *to_restore)
+static inline bool will_be_unmounted(struct mount *m)
 {
-	bool progress = false;
-	struct mount *child;
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
 
-	/*
-	 * The state of the parent won't change if this mount is
-	 * already unmounted or marked as without children.
-	 */
-	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
-		goto out;
+static void umount_one(struct mount *m, struct list_head *to_umount)
+{
+	m->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&m->mnt_child);
+	move_from_ns(m, to_umount);
+}
 
-	/* Verify topper is the only grandchild that has not been
-	 * speculatively unmounted.
-	 */
-	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-			continue;
-		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
-			continue;
-		/* Found a mounted child */
-		goto children;
-	}
+static void remove_from_candidate_list(struct mount *m)
+{
+	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	list_del_init(&m->mnt_list);
+}
 
-	/* Mark mounts that can be unmounted if not locked */
-	SET_MNT_MARK(mnt);
-	progress = true;
+static void gather_candidates(struct list_head *set,
+			      struct list_head *candidates)
+{
+	struct mount *m, *p, *q;
 
-	/* If a mount is without children and not locked umount it. */
-	if (!IS_MNT_LOCKED(mnt)) {
-		umount_one(mnt, to_umount);
-	} else {
-children:
-		list_move_tail(&mnt->mnt_umounting, to_restore);
+	list_for_each_entry(m, set, mnt_list) {
+		if (is_candidate(m))
+			continue;
+		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		p = m->mnt_parent;
+		q = propagation_next(p, p);
+		while (q) {
+			struct mount *child = __lookup_mnt(&q->mnt,
+							   m->mnt_mountpoint);
+			if (child) {
+				/*
+				 * We might've already run into this one.  That
+				 * must've happened on earlier iteration of the
+				 * outer loop; in that case we can skip those
+				 * parents that get propagation from q - there
+				 * will be nothing new on those as well.
+				 */
+				if (is_candidate(child)) {
+					q = skip_propagation_subtree(q, p);
+					continue;
+				}
+				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				if (!will_be_unmounted(child))
+					list_add(&child->mnt_list, candidates);
+			}
+			q = propagation_next(q, p);
+		}
 	}
-out:
-	return progress;
+	list_for_each_entry(m, set, mnt_list)
+		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 }
 
-static void umount_list(struct list_head *to_umount,
-			struct list_head *to_restore)
+/*
+ * We know that some child of @m can't be unmounted.  In all places where the
+ * chain of descent of @m has child not overmounting the root of parent,
+ * the parent can't be unmounted either.
+ */
+static void trim_ancestors(struct mount *m)
 {
-	struct mount *mnt, *child, *tmp;
-	list_for_each_entry(mnt, to_umount, mnt_list) {
-		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
-			/* topper? */
-			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-				list_move_tail(&child->mnt_umounting, to_restore);
-			else
-				umount_one(child, to_umount);
-		}
+	struct mount *p;
+
+	for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
+		if (IS_MNT_MARKED(m))	// all candidates beneath are overmounts
+			return;
+		SET_MNT_MARK(m);
+		if (m != p->overmount)
+			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 	}
 }
 
-static void restore_mounts(struct list_head *to_restore)
+/*
+ * Find and exclude all umount candidates forbidden by @m
+ * (see Documentation/filesystems/propagate_umount.txt)
+ * If we can immediately tell that @m is OK to unmount (unlocked
+ * and all children are already committed to unmounting) commit
+ * to unmounting it.
+ * Only @m itself might be taken from the candidates list;
+ * anything found by trim_ancestors() is marked non-candidate
+ * and left on the list.
+ */
+static void trim_one(struct mount *m, struct list_head *to_umount)
 {
-	/* Restore mounts to a clean working state */
-	while (!list_empty(to_restore)) {
-		struct mount *mnt, *parent;
-		struct mountpoint *mp;
-
-		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
-		CLEAR_MNT_MARK(mnt);
-		list_del_init(&mnt->mnt_umounting);
-
-		/* Should this mount be reparented? */
-		mp = mnt->mnt_mp;
-		parent = mnt->mnt_parent;
-		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
-			mp = parent->mnt_mp;
-			parent = parent->mnt_parent;
-		}
-		if (parent != mnt->mnt_parent) {
-			mnt_change_mountpoint(parent, mp, mnt);
-			mnt_notify_add(mnt);
+	bool remove_this = false, found = false, umount_this = false;
+	struct mount *n;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+
+	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+		if (!is_candidate(n)) {
+			found = true;
+			if (n != m->overmount) {
+				remove_this = true;
+				break;
+			}
 		}
 	}
+	if (found) {
+		trim_ancestors(m);
+	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+		remove_this = true;
+		umount_this = true;
+	}
+	if (remove_this) {
+		remove_from_candidate_list(m);
+		if (umount_this)
+			umount_one(m, to_umount);
+	}
 }
 
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m, struct list_head *to_umount)
 {
-	while (!list_empty(visited)) {
-		struct mount *mnt =
-			list_first_entry(visited, struct mount, mnt_umounting);
-		list_del_init(&mnt->mnt_umounting);
+	struct mount *cutoff = m, *p;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+	for (p = m; is_candidate(p); p = p->mnt_parent) {
+		remove_from_candidate_list(p);
+		if (!IS_MNT_LOCKED(p))
+			cutoff = p->mnt_parent;
+	}
+	if (will_be_unmounted(p))
+		cutoff = p;
+	while (m != cutoff) {
+		umount_one(m, to_umount);
+		m = m->mnt_parent;
 	}
 }
 
 /*
- * collect all mounts that receive propagation from the mount in @list,
- * and return these additional mounts in the same list.
- * @list: the list of mounts to be unmounted.
+ * @m is not to going away, and it overmounts the top of a stack of mounts
+ * that are going away.  We know that all of those are fully overmounted
+ * by the one above (@m being the topmost of the chain), so @m can be slid
+ * in place where the bottom of the stack is attached.
  *
- * vfsmount lock must be held for write
+ * NOTE: here we temporarily violate a constraint - two mounts end up with
+ * the same parent and mountpoint; that will be remedied as soon as we
+ * return from propagate_umount() - its caller (umount_tree()) will detach
+ * the stack from the parent it (and now @m) is attached to.  umount_tree()
+ * might choose to keep unmounted pieces stuck to each other, but it always
+ * detaches them from the mounts that remain in the tree.
  */
-int propagate_umount(struct list_head *list)
+static void reparent(struct mount *m)
 {
-	struct mount *mnt;
-	LIST_HEAD(to_restore);
-	LIST_HEAD(to_umount);
-	LIST_HEAD(visited);
-
-	/* Find candidates for unmounting */
-	list_for_each_entry_reverse(mnt, list, mnt_list) {
-		struct mount *parent = mnt->mnt_parent;
-		struct mount *m;
+	struct mount *p = m;
+	struct mountpoint *mp;
 
-		/*
-		 * If this mount has already been visited it is known that it's
-		 * entire peer group and all of their slaves in the propagation
-		 * tree for the mountpoint has already been visited and there is
-		 * no need to visit them again.
-		 */
-		if (!list_empty(&mnt->mnt_umounting))
-			continue;
+	do {
+		mp = p->mnt_mp;
+		p = p->mnt_parent;
+	} while (will_be_unmounted(p));
 
-		list_add_tail(&mnt->mnt_umounting, &visited);
-		for (m = propagation_next(parent, parent); m;
-		     m = propagation_next(m, parent)) {
-			struct mount *child = __lookup_mnt(&m->mnt,
-							   mnt->mnt_mountpoint);
-			if (!child)
-				continue;
+	mnt_change_mountpoint(p, mp, m);
+	mnt_notify_add(m);
+}
 
-			if (!list_empty(&child->mnt_umounting)) {
-				/*
-				 * If the child has already been visited it is
-				 * know that it's entire peer group and all of
-				 * their slaves in the propgation tree for the
-				 * mountpoint has already been visited and there
-				 * is no need to visit this subtree again.
-				 */
-				m = skip_propagation_subtree(m, parent);
-				continue;
-			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
-				/*
-				 * We have come across a partially unmounted
-				 * mount in a list that has not been visited
-				 * yet. Remember it has been visited and
-				 * continue about our merry way.
-				 */
-				list_add_tail(&child->mnt_umounting, &visited);
-				continue;
-			}
+/**
+ * propagate_umount - apply propagation rules to the set of mounts for umount()
+ * @set: the list of mounts to be unmounted.
+ *
+ * Collect all mounts that receive propagation from the mount in @set and have
+ * no obstacles to being unmounted.  Add these additional mounts to the set.
+ *
+ * See Documentation/filesystems/propagate_umount.txt if you do anything in
+ * this area.
+ *
+ * Locks held:
+ * mount_lock (write_seqlock), namespace_sem (exclusive).
+ */
+void propagate_umount(struct list_head *set)
+{
+	struct mount *m, *p;
+	LIST_HEAD(to_umount);	// committed to unmounting
+	LIST_HEAD(candidates);	// undecided umount candidates
 
-			/* Check the child and parents while progress is made */
-			while (__propagate_umount(child,
-						  &to_umount, &to_restore)) {
-				/* Is the parent a umount candidate? */
-				child = child->mnt_parent;
-				if (list_empty(&child->mnt_umounting))
-					break;
-			}
-		}
+	// collect all candidates
+	gather_candidates(set, &candidates);
+
+	// reduce the set until it's non-shifting
+	list_for_each_entry_safe(m, p, &candidates, mnt_list)
+		trim_one(m, &to_umount);
+
+	// ... and non-revealing
+	while (!list_empty(&candidates)) {
+		m = list_first_entry(&candidates,struct mount, mnt_list);
+		handle_locked(m, &to_umount);
 	}
 
-	umount_list(&to_umount, &to_restore);
-	restore_mounts(&to_restore);
-	cleanup_umount_visitations(&visited);
-	list_splice_tail(&to_umount, list);
+	// now to_umount consists of all acceptable candidates
+	// deal with reparenting of remaining overmounts on those
+	list_for_each_entry(m, &to_umount, mnt_list) {
+		if (m->overmount)
+			reparent(m->overmount);
+	}
 
-	return 0;
+	// and fold them into the set
+	list_splice_tail_init(&to_umount, set);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 93fa9311bd07ff..04f1ac53aa4979 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,7 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c145820fcbbf30..65fa8442c00ac7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -40,6 +40,7 @@ enum mount_flags {
 
 	MNT_INTERNAL	= 0x4000,
 
+	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -66,7 +67,7 @@ enum mount_flags {
 
 	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
 			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED,
+			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
 };
 
 struct vfsmount {
-- 
cgit 1.2.3-korg


From 493a4bebf5157a5da64e36f8d468ff80a859b563 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 27 Apr 2025 22:53:13 -0400
Subject: don't have mounts pin their parents

Simplify the rules for mount refcounts.  Current rules include:
	* being a namespace root => +1
	* being someone's child => +1
	* being someone's child => +1 to parent's refcount, unless you've
				   already been through umount_tree().

The last part is not needed at all.  It makes for more places where need
to decrement refcounts and it creates an asymmetry between the situations
for something that has never been a part of a namespace and something that
left one, both for no good reason.

If mount's refcount has additions from its children, we know that
	* it's either someone's child itself (and will remain so
until umount_tree(), at which point contributions from children
will disappear), or
	* or is the root of namespace (and will remain such until
it either becomes someone's child in another namespace or goes through
umount_tree()), or
	* it is the root of some tree copy, and is currently pinned
by the caller of copy_tree() (and remains such until it either gets
into namespace, or goes to umount_tree()).
In all cases we already have contribution(s) to refcount that will last
as long as the contribution from children remains.  In other words, the
lifetime is not affected by refcount contributions from children.

It might be useful for "is it busy" checks, but those are actually
no harder to express without it.

NB: propagate_mnt_busy() part is an equivalent transformation, ugly as it
is; the current logics is actually wrong and may give false negatives,
but fixing that is for a separate patch (probably earlier in the queue).

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 31 +++++++++----------------------
 fs/pnode.c     | 49 +++++++++++++++++--------------------------------
 2 files changed, 26 insertions(+), 54 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6df0436bfcb974..4bdf6a6e75cab0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1075,7 +1075,6 @@ void mnt_set_mountpoint(struct mount *mnt,
 			struct mountpoint *mp,
 			struct mount *child_mnt)
 {
-	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
 	child_mnt->mnt_mountpoint = mp->m_dentry;
 	child_mnt->mnt_parent = mnt;
 	child_mnt->mnt_mp = mp;
@@ -1118,7 +1117,6 @@ static void attach_mnt(struct mount *mnt, struct mount *parent,
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
 	struct mountpoint *old_mp = mnt->mnt_mp;
-	struct mount *old_parent = mnt->mnt_parent;
 
 	list_del_init(&mnt->mnt_child);
 	hlist_del_init(&mnt->mnt_mp_list);
@@ -1127,7 +1125,6 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 	attach_mnt(mnt, parent, mp);
 
 	maybe_free_mountpoint(old_mp, &ex_mountpoints);
-	mnt_add_count(old_parent, -1);
 }
 
 static inline struct mount *node_to_mount(struct rb_node *node)
@@ -1652,23 +1649,19 @@ const struct seq_operations mounts_op = {
 int may_umount_tree(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	int actual_refs = 0;
-	int minimum_refs = 0;
-	struct mount *p;
-	BUG_ON(!m);
+	bool busy = false;
 
 	/* write lock needed for mnt_get_count */
 	lock_mount_hash();
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		actual_refs += mnt_get_count(p);
-		minimum_refs += 2;
+	for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+		if (mnt_get_count(p) > (p == mnt ? 2 : 1)) {
+			busy = true;
+			break;
+		}
 	}
 	unlock_mount_hash();
 
-	if (actual_refs > minimum_refs)
-		return 0;
-
-	return 1;
+	return !busy;
 }
 
 EXPORT_SYMBOL(may_umount_tree);
@@ -1869,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 
 		disconnect = disconnect_mount(p, how);
 		if (mnt_has_parent(p)) {
-			mnt_add_count(p->mnt_parent, -1);
 			if (!disconnect) {
 				/* Don't forget about p */
 				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -1946,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags)
 		 * all race cases, but it's a slowpath.
 		 */
 		lock_mount_hash();
-		if (mnt_get_count(mnt) != 2) {
+		if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
 			unlock_mount_hash();
 			return -EBUSY;
 		}
@@ -3683,9 +3675,7 @@ static int do_move_mount(struct path *old_path,
 out:
 	unlock_mount(&mp);
 	if (!err) {
-		if (!is_anon_ns(ns)) {
-			mntput_no_expire(parent);
-		} else {
+		if (is_anon_ns(ns)) {
 			/* Make sure we notice when we leak mounts. */
 			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
 			free_mnt_ns(ns);
@@ -4753,7 +4743,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
 	umount_mnt(root_mnt);
-	mnt_add_count(root_parent, -1);
 	/* mount old root on put_old */
 	attach_mnt(root_mnt, old_mnt, old_mp.mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
@@ -4766,8 +4755,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = 0;
 out4:
 	unlock_mount(&old_mp);
-	if (!error)
-		mntput_no_expire(ex_parent);
 out3:
 	path_put(&root);
 out2:
diff --git a/fs/pnode.c b/fs/pnode.c
index 901d40946d3414..827d71736ac5bd 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -332,21 +332,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	return ret;
 }
 
-static struct mount *find_topper(struct mount *mnt)
-{
-	/* If there is exactly one mount covering mnt completely return it. */
-	struct mount *child;
-
-	if (!list_is_singular(&mnt->mnt_mounts))
-		return NULL;
-
-	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
-	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
-		return NULL;
-
-	return child;
-}
-
 /*
  * return true if the refcount is greater than count
  */
@@ -404,12 +389,8 @@ bool propagation_would_overmount(const struct mount *from,
  */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
 
-	if (mnt == parent)
-		return do_refcount_check(mnt, refcnt);
-
 	/*
 	 * quickly check if the current mount can be unmounted.
 	 * If not, we don't have to go checking for all other
@@ -418,23 +399,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
 		return 1;
 
-	for (m = propagation_next(parent, parent); m;
+	if (mnt == parent)
+		return 0;
+
+	for (struct mount *m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		int count = 1;
-		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
-		if (!child)
-			continue;
+		struct list_head *head;
+		struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 
-		/* Is there exactly one mount on the child that covers
-		 * it completely whose reference should be ignored?
-		 */
-		topper = find_topper(child);
-		if (topper)
-			count += 1;
-		else if (!list_empty(&child->mnt_mounts))
+		if (!child)
 			continue;
 
-		if (do_refcount_check(child, count))
+		head = &child->mnt_mounts;
+		if (!list_empty(head)) {
+			/*
+			 * a mount that covers child completely wouldn't prevent
+			 * it being pulled out; any other would.
+			 */
+			if (!list_is_singular(head) || !child->overmount)
+				continue;
+		}
+		if (do_refcount_check(child, 1))
 			return 1;
 	}
 	return 0;
-- 
cgit 1.2.3-korg


From 406fea79992561f47fd3511dd8b7c8abeeff7045 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 21 Jun 2025 18:06:19 -0400
Subject: mount: separate the flags accessed only under namespace_sem

Several flags are updated and checked only under namespace_sem; we are
already making use of that when we are checking them without mount_lock,
but we have to hold mount_lock for all updates, which makes things
clumsier than they have to be.

Take MNT_SHARED, MNT_UNBINDABLE, MNT_MARKED and MNT_UMOUNT_CANDIDATE
into a separate field (->mnt_t_flags), renaming them to T_SHARED,
etc. to avoid confusion.  All accesses must be under namespace_sem.

That changes locking requirements for mnt_change_propagation() and
set_mnt_shared() - only namespace_sem is needed now.  The same goes
for SET_MNT_MARKED et.al.

There might be more flags moved from ->mnt_flags to that field;
this is just the initial set.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/propagate_umount.txt | 12 ++++++------
 fs/mount.h                                     | 17 +++++++++++++++++
 fs/namespace.c                                 |  4 ----
 fs/pnode.c                                     | 22 ++++++++++------------
 fs/pnode.h                                     | 19 +++++++++++--------
 include/linux/mount.h                          | 18 ++----------------
 6 files changed, 46 insertions(+), 46 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
index 6906903a8aa29f..c90349e5b889fb 100644
--- a/Documentation/filesystems/propagate_umount.txt
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -453,11 +453,11 @@ original set.
 So let's go for
 	* original set ("set").  Linkage via mnt_list
 	* undecided candidates ("candidates").  Subset of a list,
-consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+consisting of all its elements marked with a new flag (T_UMOUNT_CANDIDATE).
 Initially all elements of the list will be marked that way; in the
 end the list will become empty and no mounts will remain marked with
 that flag.
-	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* Reuse T_MARKED for "has been already seen by trim_ancestors()".
 	* anything in U that hadn't been in the original set - elements of
 candidates will gradually be either discarded or moved there.  In other
 words, it's the candidates we have already decided to unmount.	Its role
@@ -465,13 +465,13 @@ is reasonably close to the old "to_umount", so let's use that name.
 Linkage via mnt_list.
 
 For gather_candidates() we'll need to maintain both candidates (S -
-set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+set) and intersection of S with set.  Use T_UMOUNT_CANDIDATE for
 all elements we encounter, putting the ones not already in the original
 set into the list of candidates.  When we are done, strip that flag from
 all elements of the original set.  That gives a cheap way to check
 if element belongs to S (in gather_candidates) and to candidates
 itself (at later stages).  Call that predicate is_candidate(); it would
-be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+be m->mnt_t_flags & T_UMOUNT_CANDIDATE.
 
 All elements of the original set are marked with MNT_UMOUNT and we'll
 need the same for elements added when joining the contents of to_umount
@@ -480,5 +480,5 @@ to to_umount; that's close to what the old 'umount_one' is doing, so
 let's keep that name.  It also gives us another predicate we need -
 "belongs to union of set and to_umount"; will_be_unmounted() for now.
 
-Removals from the candidates list should strip both MNT_MARKED and
-MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
+Removals from the candidates list should strip both T_MARKED and
+T_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index 4355c482a841ce..f299dc85446d70 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -84,6 +84,7 @@ struct mount {
 	struct list_head to_notify;	/* need to queue notification */
 	struct mnt_namespace *prev_ns;	/* previous namespace (NULL if none) */
 #endif
+	int mnt_t_flags;		/* namespace_sem-protected flags */
 	int mnt_id;			/* mount identifier, reused */
 	u64 mnt_id_unique;		/* mount ID unique until reboot */
 	int mnt_group_id;		/* peer group identifier */
@@ -93,6 +94,22 @@ struct mount {
 	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
+enum {
+	T_SHARED		= 1, /* mount is shared */
+	T_UNBINDABLE		= 2, /* mount is unbindable */
+	T_MARKED		= 4, /* internal mark for propagate_... */
+	T_UMOUNT_CANDIDATE	= 8, /* for propagate_umount */
+
+	/*
+	 * T_SHARED_MASK is the set of flags that should be cleared when a
+	 * mount becomes shared.  Currently, this is only the flag that says a
+	 * mount cannot be bind mounted, since this is how we create a mount
+	 * that shares events with another mount.  If you add a new T_*
+	 * flag, consider how it interacts with shared mounts.
+	 */
+	T_SHARED_MASK	= T_UNBINDABLE,
+};
+
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
 
 static inline struct mount *real_mount(struct vfsmount *mnt)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4bdf6a6e75cab0..da27365418a5a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2917,10 +2917,8 @@ static int do_change_type(struct path *path, int ms_flags)
 			goto out_unlock;
 	}
 
-	lock_mount_hash();
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	unlock_mount_hash();
 
  out_unlock:
 	namespace_unlock();
@@ -3409,9 +3407,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 	if (IS_MNT_SHARED(from)) {
 		to->mnt_group_id = from->mnt_group_id;
 		list_add(&to->mnt_share, &from->mnt_share);
-		lock_mount_hash();
 		set_mnt_shared(to);
-		unlock_mount_hash();
 	}
 
 	err = 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 827d71736ac5bd..b997663de6d0b9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -112,7 +112,7 @@ static int do_make_slave(struct mount *mnt)
 }
 
 /*
- * vfsmount lock must be held for write
+ * EXCL[namespace_sem]
  */
 void change_mnt_propagation(struct mount *mnt, int type)
 {
@@ -125,9 +125,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		list_del_init(&mnt->mnt_slave);
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
-			mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+			mnt->mnt_t_flags |= T_UNBINDABLE;
 		else
-			mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+			mnt->mnt_t_flags &= ~T_UNBINDABLE;
 	}
 }
 
@@ -263,9 +263,9 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 		return PTR_ERR(child);
 	read_seqlock_excl(&mount_lock);
 	mnt_set_mountpoint(m, dest_mp, child);
+	read_sequnlock_excl(&mount_lock);
 	if (m->mnt_master != dest_master)
 		SET_MNT_MARK(m->mnt_master);
-	read_sequnlock_excl(&mount_lock);
 	last_dest = m;
 	last_source = child;
 	hlist_add_head(&child->mnt_hash, list);
@@ -322,13 +322,11 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		} while (n != m);
 	}
 out:
-	read_seqlock_excl(&mount_lock);
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
 		if (m->mnt_master != dest_mnt->mnt_master)
 			CLEAR_MNT_MARK(m->mnt_master);
 	}
-	read_sequnlock_excl(&mount_lock);
 	return ret;
 }
 
@@ -447,7 +445,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 static inline bool is_candidate(struct mount *m)
 {
-	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
+	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
 }
 
 static inline bool will_be_unmounted(struct mount *m)
@@ -464,7 +462,7 @@ static void umount_one(struct mount *m, struct list_head *to_umount)
 
 static void remove_from_candidate_list(struct mount *m)
 {
-	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
 	list_del_init(&m->mnt_list);
 }
 
@@ -476,7 +474,7 @@ static void gather_candidates(struct list_head *set,
 	list_for_each_entry(m, set, mnt_list) {
 		if (is_candidate(m))
 			continue;
-		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 		p = m->mnt_parent;
 		q = propagation_next(p, p);
 		while (q) {
@@ -494,7 +492,7 @@ static void gather_candidates(struct list_head *set,
 					q = skip_propagation_subtree(q, p);
 					continue;
 				}
-				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 				if (!will_be_unmounted(child))
 					list_add(&child->mnt_list, candidates);
 			}
@@ -502,7 +500,7 @@ static void gather_candidates(struct list_head *set,
 		}
 	}
 	list_for_each_entry(m, set, mnt_list)
-		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 }
 
 /*
@@ -519,7 +517,7 @@ static void trim_ancestors(struct mount *m)
 			return;
 		SET_MNT_MARK(m);
 		if (m != p->overmount)
-			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+			p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 	}
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index 04f1ac53aa4979..507e30e7a42040 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,14 +10,14 @@
 #include <linux/list.h>
 #include "mount.h"
 
-#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
 #define IS_MNT_NEW(m) (!(m)->mnt_ns)
-#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
-#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
-#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
-#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
 
 #define CL_EXPIRE    		0x01
@@ -28,10 +28,13 @@
 #define CL_SHARED_TO_SLAVE	0x20
 #define CL_COPY_MNT_NS_FILE	0x40
 
+/*
+ * EXCL[namespace_sem]
+ */
 static inline void set_mnt_shared(struct mount *mnt)
 {
-	mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
-	mnt->mnt.mnt_flags |= MNT_SHARED;
+	mnt->mnt_t_flags &= ~T_SHARED_MASK;
+	mnt->mnt_t_flags |= T_SHARED;
 }
 
 static inline bool peers(const struct mount *m1, const struct mount *m2)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 65fa8442c00ac7..5f9c053b08971a 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -35,12 +35,8 @@ enum mount_flags {
 	MNT_SHRINKABLE	= 0x100,
 	MNT_WRITE_HOLD	= 0x200,
 
-	MNT_SHARED	= 0x1000, /* if the vfsmount is a shared mount */
-	MNT_UNBINDABLE	= 0x2000, /* if the vfsmount is a unbindable mount */
-
 	MNT_INTERNAL	= 0x4000,
 
-	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -49,25 +45,15 @@ enum mount_flags {
 	MNT_LOCKED		= 0x800000,
 	MNT_DOOMED		= 0x1000000,
 	MNT_SYNC_UMOUNT		= 0x2000000,
-	MNT_MARKED		= 0x4000000,
 	MNT_UMOUNT		= 0x8000000,
 
-	/*
-	 * MNT_SHARED_MASK is the set of flags that should be cleared when a
-	 * mount becomes shared.  Currently, this is only the flag that says a
-	 * mount cannot be bind mounted, since this is how we create a mount
-	 * that shares events with another mount.  If you add a new MNT_*
-	 * flag, consider how it interacts with shared mounts.
-	 */
-	MNT_SHARED_MASK	= MNT_UNBINDABLE,
 	MNT_USER_SETTABLE_MASK  = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC
 				  | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME
 				  | MNT_READONLY | MNT_NOSYMFOLLOW,
 	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
 
-	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
-			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
+	MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+			     MNT_SYNC_UMOUNT | MNT_LOCKED
 };
 
 struct vfsmount {
-- 
cgit 1.2.3-korg


From 25776a09d802f4e4d8c0bd72042934223286c2e3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 21 Jun 2025 17:41:40 -0400
Subject: propagate_one(): get rid of dest_master

propagate_mnt() takes the subtree we are about to attach and creates
its copies, setting the propagation between those.  Each copy is cloned
either from the original or from one of the already created copies.
The tricky part is choosing the right copy to serve as a master when we
are starting a new peer group.

The algorithm for doing that selection puts temporary marks on the masters
of mountpoints that already got a copy created for them; since the initial
peer group might have no master at all, we need to special-case that when
looking for the mark.  Currently we do that by memorizing the master of
original peer group.  It works, but we get yet another piece of data to
pass from propagate_mnt() to propagate_one().

Alternative is to mark the master of original peer group if not NULL,
turning the check into "master is NULL or marked".  Less data to pass
around and memory safety is more obvious that way...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index b997663de6d0b9..870ebced10aa77 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -215,7 +215,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 }
 
 /* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source, *dest_master;
+static struct mount *last_dest, *first_source, *last_source;
 static struct hlist_head *list;
 
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
@@ -239,7 +239,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 		bool done;
 		for (n = m; ; n = p) {
 			p = n->mnt_master;
-			if (p == dest_master || IS_MNT_MARKED(p))
+			if (!p || IS_MNT_MARKED(p))
 				break;
 		}
 		do {
@@ -264,7 +264,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 	read_seqlock_excl(&mount_lock);
 	mnt_set_mountpoint(m, dest_mp, child);
 	read_sequnlock_excl(&mount_lock);
-	if (m->mnt_master != dest_master)
+	if (m->mnt_master)
 		SET_MNT_MARK(m->mnt_master);
 	last_dest = m;
 	last_source = child;
@@ -300,7 +300,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	first_source = source_mnt;
 	last_source = source_mnt;
 	list = tree_list;
-	dest_master = dest_mnt->mnt_master;
+	if (dest_mnt->mnt_master)
+		SET_MNT_MARK(dest_mnt->mnt_master);
 
 	/* all peers of dest_mnt, except dest_mnt itself */
 	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
@@ -324,9 +325,11 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 out:
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
-		if (m->mnt_master != dest_mnt->mnt_master)
+		if (m->mnt_master)
 			CLEAR_MNT_MARK(m->mnt_master);
 	}
+	if (dest_mnt->mnt_master)
+		CLEAR_MNT_MARK(dest_mnt->mnt_master);
 	return ret;
 }
 
-- 
cgit 1.2.3-korg


From 2b2a34793dc239b0eaebe9559557d051524a7297 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 22:56:43 -0400
Subject: propagate_mnt(): handle all peer groups in the same loop

the only difference is that for the original group we want to skip
the first element; not worth having the logics twice...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 870ebced10aa77..f55295e2621707 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -289,7 +289,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
 	struct mount *m, *n;
-	int ret = 0;
+	int err = 0;
 
 	/*
 	 * we don't want to bother passing tons of arguments to
@@ -303,26 +303,23 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
-	/* all peers of dest_mnt, except dest_mnt itself */
-	for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
-		ret = propagate_one(n, dest_mp);
-		if (ret)
-			goto out;
-	}
-
-	/* all slave groups */
-	for (m = next_group(dest_mnt, dest_mnt); m;
-			m = next_group(m, dest_mnt)) {
-		/* everything in that slave group */
-		n = m;
+	/* iterate over peer groups, depth first */
+	for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
+		if (m == dest_mnt) { // have one for dest_mnt itself
+			n = next_peer(m);
+			if (n == m)
+				continue;
+		} else {
+			n = m;
+		}
 		do {
-			ret = propagate_one(n, dest_mp);
-			if (ret)
-				goto out;
+			err = propagate_one(n, dest_mp);
+			if (err)
+				break;
 			n = next_peer(n);
 		} while (n != m);
 	}
-out:
+
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
 		if (m->mnt_master)
@@ -330,7 +327,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	}
 	if (dest_mnt->mnt_master)
 		CLEAR_MNT_MARK(dest_mnt->mnt_master);
-	return ret;
+	return err;
 }
 
 /*
-- 
cgit 1.2.3-korg


From 15e710b8bbb5c537c39ccaa23963d01c76946834 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:04:23 -0400
Subject: propagate_one(): separate the "do we need secondary here?" logics

take the checks into separate helper - need_secondary(mount, mountpoint).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index f55295e2621707..7c832f98595cf8 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -218,19 +218,24 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source;
 static struct hlist_head *list;
 
-static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
+static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
 {
-	struct mount *child;
-	int type;
 	/* skip ones added by this propagate_mnt() */
 	if (IS_MNT_NEW(m))
-		return 0;
+		return false;
 	/* skip if mountpoint isn't visible in m */
 	if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
-		return 0;
+		return false;
 	/* skip if m is in the anon_ns */
 	if (is_anon_ns(m->mnt_ns))
-		return 0;
+		return false;
+	return true;
+}
+
+static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
+{
+	struct mount *child;
+	int type;
 
 	if (peers(m, last_dest)) {
 		type = CL_MAKE_SHARED;
@@ -313,11 +318,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			n = m;
 		}
 		do {
+			if (!need_secondary(n, dest_mp))
+				continue;
 			err = propagate_one(n, dest_mp);
 			if (err)
 				break;
-			n = next_peer(n);
-		} while (n != m);
+		} while ((n = next_peer(n)) != m);
 	}
 
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
-- 
cgit 1.2.3-korg


From e0f9396e244c0dcc29921ebc3254cb25d6eb31cf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:09:47 -0400
Subject: propagate_one(): separate the "what should be the master for this
 copy" part

When we create the first copy for a peer group, it becomes a slave of
one of the existing copies; take that logics into a separate helper -
find_master(parent, last_copy, original).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 7c832f98595cf8..94de8aad4da567 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -232,6 +232,31 @@ static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
 	return true;
 }
 
+static struct mount *find_master(struct mount *m,
+				struct mount *last_copy,
+				struct mount *original)
+{
+	struct mount *p;
+
+	// ascend until there's a copy for something with the same master
+	for (;;) {
+		p = m->mnt_master;
+		if (!p || IS_MNT_MARKED(p))
+			break;
+		m = p;
+	}
+	while (!peers(last_copy, original)) {
+		struct mount *parent = last_copy->mnt_parent;
+		if (parent->mnt_master == p) {
+			if (!peers(parent, m))
+				last_copy = last_copy->mnt_master;
+			break;
+		}
+		last_copy = last_copy->mnt_master;
+	}
+	return last_copy;
+}
+
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
@@ -240,23 +265,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 	if (peers(m, last_dest)) {
 		type = CL_MAKE_SHARED;
 	} else {
-		struct mount *n, *p;
-		bool done;
-		for (n = m; ; n = p) {
-			p = n->mnt_master;
-			if (!p || IS_MNT_MARKED(p))
-				break;
-		}
-		do {
-			struct mount *parent = last_source->mnt_parent;
-			if (peers(last_source, first_source))
-				break;
-			done = parent->mnt_master == p;
-			if (done && peers(n, parent))
-				break;
-			last_source = last_source->mnt_master;
-		} while (!done);
-
+		last_source = find_master(m, last_source, first_source);
 		type = CL_SLAVE;
 		/* beginning of peer group among the slaves? */
 		if (IS_MNT_SHARED(m))
-- 
cgit 1.2.3-korg


From 6a2ce2a74bfeee4b66411177125f3b5749003e7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:16:52 -0400
Subject: propagate_one(): fold into the sole caller

mechanical expansion; will be cleaned up on the next step

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 57 +++++++++++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 32 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 94de8aad4da567..aeaec24f7456e3 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,35 +257,6 @@ static struct mount *find_master(struct mount *m,
 	return last_copy;
 }
 
-static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
-{
-	struct mount *child;
-	int type;
-
-	if (peers(m, last_dest)) {
-		type = CL_MAKE_SHARED;
-	} else {
-		last_source = find_master(m, last_source, first_source);
-		type = CL_SLAVE;
-		/* beginning of peer group among the slaves? */
-		if (IS_MNT_SHARED(m))
-			type |= CL_MAKE_SHARED;
-	}
-		
-	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
-	if (IS_ERR(child))
-		return PTR_ERR(child);
-	read_seqlock_excl(&mount_lock);
-	mnt_set_mountpoint(m, dest_mp, child);
-	read_sequnlock_excl(&mount_lock);
-	if (m->mnt_master)
-		SET_MNT_MARK(m->mnt_master);
-	last_dest = m;
-	last_source = child;
-	hlist_add_head(&child->mnt_hash, list);
-	return count_mounts(m->mnt_ns, child);
-}
-
 /*
  * mount 'source_mnt' under the destination 'dest_mnt' at
  * dentry 'dest_dentry'. And propagate that mount to
@@ -302,8 +273,8 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n;
-	int err = 0;
+	struct mount *m, *n, *child;
+	int err = 0, type;
 
 	/*
 	 * we don't want to bother passing tons of arguments to
@@ -329,7 +300,29 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		do {
 			if (!need_secondary(n, dest_mp))
 				continue;
-			err = propagate_one(n, dest_mp);
+			if (peers(n, last_dest)) {
+				type = CL_MAKE_SHARED;
+			} else {
+				last_source = find_master(n, last_source, first_source);
+				type = CL_SLAVE;
+				/* beginning of peer group among the slaves? */
+				if (IS_MNT_SHARED(n))
+					type |= CL_MAKE_SHARED;
+			}
+			child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+			if (IS_ERR(child)) {
+				err = PTR_ERR(child);
+				break;
+			}
+			read_seqlock_excl(&mount_lock);
+			mnt_set_mountpoint(n, dest_mp, child);
+			read_sequnlock_excl(&mount_lock);
+			if (n->mnt_master)
+				SET_MNT_MARK(n->mnt_master);
+			last_dest = n;
+			last_source = child;
+			hlist_add_head(&child->mnt_hash, list);
+			err = count_mounts(n->mnt_ns, child);
 			if (err)
 				break;
 		} while ((n = next_peer(n)) != m);
-- 
cgit 1.2.3-korg


From bc88530a20b1d5c78288ef5383d10b66d3242c48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:21:57 -0400
Subject: fs/pnode.c: get rid of globals

this stuff can be local in propagate_mnt() now (and in some cases
duplicates the existing variables there)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index aeaec24f7456e3..e01f43820a930d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -214,10 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 	}
 }
 
-/* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source;
-static struct hlist_head *list;
-
 static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
 {
 	/* skip ones added by this propagate_mnt() */
@@ -273,18 +269,11 @@ static struct mount *find_master(struct mount *m,
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n, *child;
+	struct mount *m, *n, *copy, *this, *last_dest;
 	int err = 0, type;
 
-	/*
-	 * we don't want to bother passing tons of arguments to
-	 * propagate_one(); everything is serialized by namespace_sem,
-	 * so globals will do just fine.
-	 */
 	last_dest = dest_mnt;
-	first_source = source_mnt;
-	last_source = source_mnt;
-	list = tree_list;
+	copy = source_mnt;
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
@@ -303,26 +292,26 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			if (peers(n, last_dest)) {
 				type = CL_MAKE_SHARED;
 			} else {
-				last_source = find_master(n, last_source, first_source);
+				copy = find_master(n, copy, source_mnt);
 				type = CL_SLAVE;
 				/* beginning of peer group among the slaves? */
 				if (IS_MNT_SHARED(n))
 					type |= CL_MAKE_SHARED;
 			}
-			child = copy_tree(last_source, last_source->mnt.mnt_root, type);
-			if (IS_ERR(child)) {
-				err = PTR_ERR(child);
+			this = copy_tree(copy, copy->mnt.mnt_root, type);
+			if (IS_ERR(this)) {
+				err = PTR_ERR(this);
 				break;
 			}
 			read_seqlock_excl(&mount_lock);
-			mnt_set_mountpoint(n, dest_mp, child);
+			mnt_set_mountpoint(n, dest_mp, this);
 			read_sequnlock_excl(&mount_lock);
 			if (n->mnt_master)
 				SET_MNT_MARK(n->mnt_master);
 			last_dest = n;
-			last_source = child;
-			hlist_add_head(&child->mnt_hash, list);
-			err = count_mounts(n->mnt_ns, child);
+			copy = this;
+			hlist_add_head(&this->mnt_hash, tree_list);
+			err = count_mounts(n->mnt_ns, this);
 			if (err)
 				break;
 		} while ((n = next_peer(n)) != m);
-- 
cgit 1.2.3-korg


From 0a10217e5cf82835b63875752b57f01bba0bf5b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:27:48 -0400
Subject: propagate_mnt(): get rid of last_dest

Its only use is choosing the type of copy - CL_MAKE_SHARED if there
already is a copy in that peer group, CL_SLAVE or CL_SLAVE | CL_MAKE_SHARED
otherwise.

But that's easy to keep track of - just set type in the beginning of group
and reset to CL_MAKE_SHARED after the first created secondary in it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index e01f43820a930d..b3af55123a82c7 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -269,35 +269,32 @@ static struct mount *find_master(struct mount *m,
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct hlist_head *tree_list)
 {
-	struct mount *m, *n, *copy, *this, *last_dest;
+	struct mount *m, *n, *copy, *this;
 	int err = 0, type;
 
-	last_dest = dest_mnt;
-	copy = source_mnt;
 	if (dest_mnt->mnt_master)
 		SET_MNT_MARK(dest_mnt->mnt_master);
 
 	/* iterate over peer groups, depth first */
 	for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
 		if (m == dest_mnt) { // have one for dest_mnt itself
+			copy = source_mnt;
+			type = CL_MAKE_SHARED;
 			n = next_peer(m);
 			if (n == m)
 				continue;
 		} else {
+			type = CL_SLAVE;
+			/* beginning of peer group among the slaves? */
+			if (IS_MNT_SHARED(m))
+				type |= CL_MAKE_SHARED;
 			n = m;
 		}
 		do {
 			if (!need_secondary(n, dest_mp))
 				continue;
-			if (peers(n, last_dest)) {
-				type = CL_MAKE_SHARED;
-			} else {
+			if (type & CL_SLAVE) // first in this peer group
 				copy = find_master(n, copy, source_mnt);
-				type = CL_SLAVE;
-				/* beginning of peer group among the slaves? */
-				if (IS_MNT_SHARED(n))
-					type |= CL_MAKE_SHARED;
-			}
 			this = copy_tree(copy, copy->mnt.mnt_root, type);
 			if (IS_ERR(this)) {
 				err = PTR_ERR(this);
@@ -308,12 +305,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 			read_sequnlock_excl(&mount_lock);
 			if (n->mnt_master)
 				SET_MNT_MARK(n->mnt_master);
-			last_dest = n;
 			copy = this;
 			hlist_add_head(&this->mnt_hash, tree_list);
 			err = count_mounts(n->mnt_ns, this);
 			if (err)
 				break;
+			type = CL_MAKE_SHARED;
 		} while ((n = next_peer(n)) != m);
 	}
 
-- 
cgit 1.2.3-korg


From 0313356520b15deab893cd62da3e2ba9a6e61a1f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Jun 2025 23:39:23 -0400
Subject: propagate_mnt(): fix comment and convert to kernel-doc, while we are
 at it

Mountpoint is passed as struct mountpoint *, not struct dentry *
(and called dest_mp, not dest_dentry) since 2013.

Roots of created copies are linked via mnt_hash, not mnt_list since
a bit before the merge into mainline back in 2005.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index b3af55123a82c7..b887116f0041af 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -253,21 +253,20 @@ static struct mount *find_master(struct mount *m,
 	return last_copy;
 }
 
-/*
- * mount 'source_mnt' under the destination 'dest_mnt' at
- * dentry 'dest_dentry'. And propagate that mount to
- * all the peer and slave mounts of 'dest_mnt'.
- * Link all the new mounts into a propagation tree headed at
- * source_mnt. Also link all the new mounts using ->mnt_list
- * headed at source_mnt's ->mnt_list
+/**
+ * propagate_mnt() - create secondary copies for tree attachment
+ * @dest_mnt:    destination mount.
+ * @dest_mp:     destination mountpoint.
+ * @source_mnt:  source mount.
+ * @tree_list:   list of secondaries to be attached.
  *
- * @dest_mnt: destination mount.
- * @dest_dentry: destination dentry.
- * @source_mnt: source mount.
- * @tree_list : list of heads of trees to be attached.
+ * Create secondary copies for attaching a tree with root @source_mnt
+ * at mount @dest_mnt with mountpoint @dest_mp.  Link all new mounts
+ * into a propagation graph.  Set mountpoints for all secondaries,
+ * link their roots into @tree_list via ->mnt_hash.
  */
 int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
-		    struct mount *source_mnt, struct hlist_head *tree_list)
+		  struct mount *source_mnt, struct hlist_head *tree_list)
 {
 	struct mount *m, *n, *copy, *this;
 	int err = 0, type;
-- 
cgit 1.2.3-korg


From d5f15047f13b86b74d1ac4f39036ccae2078c492 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:20:47 -0400
Subject: change_mnt_propagation() cleanups, step 1

Lift changing ->mnt_slave from do_make_slave() into the caller.
Simplifies the next steps...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index b887116f0041af..14618eac20254e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -104,7 +104,6 @@ static int do_make_slave(struct mount *mnt)
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 		slave_mnt->mnt_master = master;
-	list_move(&mnt->mnt_slave, &master->mnt_slave_list);
 	list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	mnt->mnt_master = master;
@@ -121,8 +120,12 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		return;
 	}
 	do_make_slave(mnt);
-	if (type != MS_SLAVE) {
-		list_del_init(&mnt->mnt_slave);
+	list_del_init(&mnt->mnt_slave);
+	if (type == MS_SLAVE) {
+		if (mnt->mnt_master)
+			list_add(&mnt->mnt_slave,
+				 &mnt->mnt_master->mnt_slave_list);
+	} else {
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
 			mnt->mnt_t_flags |= T_UNBINDABLE;
-- 
cgit 1.2.3-korg


From ef86251194de9b31f7efcf70ea480ebe736e9e60 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:25:00 -0400
Subject: change_mnt_propagation(): do_make_slave() is a no-op unless
 IS_MNT_SHARED()

... since mnt->mnt_share and mnt->mnt_slave_list are guaranteed to be empty unless
IS_MNT_SHARED(mnt).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 14618eac20254e..9723f05cda5f15 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -70,10 +70,8 @@ static int do_make_slave(struct mount *mnt)
 	struct mount *master, *slave_mnt;
 
 	if (list_empty(&mnt->mnt_share)) {
-		if (IS_MNT_SHARED(mnt)) {
-			mnt_release_group_id(mnt);
-			CLEAR_MNT_SHARED(mnt);
-		}
+		mnt_release_group_id(mnt);
+		CLEAR_MNT_SHARED(mnt);
 		master = mnt->mnt_master;
 		if (!master) {
 			struct list_head *p = &mnt->mnt_slave_list;
@@ -119,7 +117,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		set_mnt_shared(mnt);
 		return;
 	}
-	do_make_slave(mnt);
+	if (IS_MNT_SHARED(mnt))
+		do_make_slave(mnt);
 	list_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)
-- 
cgit 1.2.3-korg


From 955336e204ab59301ff8b1f75a98a226f5a98782 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 18:12:56 -0400
Subject: do_make_slave(): choose new master sanely

When mount changes propagation type so that it doesn't propagate
events any more (MS_PRIVATE, MS_SLAVE, MS_UNBINDABLE), we need
to make sure that event propagation between other mounts is
unaffected.

We need to make sure that events from peers and master of that mount
(if any) still reach everything that used to be on its ->mnt_slave_list.

If mount has neither peers nor master, we simply need to dissolve
its ->mnt_slave_list and clear ->mnt_master of everything in there.

If mount has peers, we transfer everything in ->mnt_slave_list of
this mount into that of some of those peers (and adjust ->mnt_master
accordingly).

If mount has a master but no peers, we transfer everything in
->mnt_slave_list of this mount into that of its master (adjusting
->mnt_master, etc.).

There are two problems with the current implementation:
	* there's a long-obsolete logics in choosing the peer -
once upon a time it made sense to prefer the peer that had the
same ->mnt_root as our mount, but that had been pointless since
2014 ("smarter propagate_mnt()")
	* the most common caller of that thing is umount_tree()
taking the mounts out of propagation graph.  In that case it's
possible to have ->mnt_slave_list contents moved many times,
since the replacement master is likely to be taken out by the
same umount_tree(), etc.

Take the choice of replacement master into a separate function
(propagation_source()) and teach it to skip the candidates that
are going to be taken out.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 62 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 9723f05cda5f15..91d10af867bdef 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -65,40 +65,45 @@ int get_dominating_id(struct mount *mnt, const struct path *root)
 	return 0;
 }
 
+static inline bool will_be_unmounted(struct mount *m)
+{
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
+
+static struct mount *propagation_source(struct mount *mnt)
+{
+	do {
+		struct mount *m;
+		for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
+			if (!will_be_unmounted(m))
+				return m;
+		}
+		mnt = mnt->mnt_master;
+	} while (mnt && will_be_unmounted(mnt));
+	return mnt;
+}
+
 static int do_make_slave(struct mount *mnt)
 {
-	struct mount *master, *slave_mnt;
+	struct mount *master = propagation_source(mnt);
+	struct mount *slave_mnt;
 
 	if (list_empty(&mnt->mnt_share)) {
 		mnt_release_group_id(mnt);
-		CLEAR_MNT_SHARED(mnt);
-		master = mnt->mnt_master;
-		if (!master) {
-			struct list_head *p = &mnt->mnt_slave_list;
-			while (!list_empty(p)) {
-				slave_mnt = list_first_entry(p,
-						struct mount, mnt_slave);
-				list_del_init(&slave_mnt->mnt_slave);
-				slave_mnt->mnt_master = NULL;
-			}
-			return 0;
-		}
 	} else {
-		struct mount *m;
-		/*
-		 * slave 'mnt' to a peer mount that has the
-		 * same root dentry. If none is available then
-		 * slave it to anything that is available.
-		 */
-		for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
-			if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
-				master = m;
-				break;
-			}
-		}
 		list_del_init(&mnt->mnt_share);
 		mnt->mnt_group_id = 0;
-		CLEAR_MNT_SHARED(mnt);
+	}
+	CLEAR_MNT_SHARED(mnt);
+	if (!master) {
+		struct list_head *p = &mnt->mnt_slave_list;
+		while (!list_empty(p)) {
+			slave_mnt = list_first_entry(p,
+					struct mount, mnt_slave);
+			list_del_init(&slave_mnt->mnt_slave);
+			slave_mnt->mnt_master = NULL;
+		}
+		return 0;
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 		slave_mnt->mnt_master = master;
@@ -443,11 +448,6 @@ static inline bool is_candidate(struct mount *m)
 	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
 }
 
-static inline bool will_be_unmounted(struct mount *m)
-{
-	return m->mnt.mnt_flags & MNT_UMOUNT;
-}
-
 static void umount_one(struct mount *m, struct list_head *to_umount)
 {
 	m->mnt.mnt_flags |= MNT_UMOUNT;
-- 
cgit 1.2.3-korg


From 94a8d0027606397ce58b00077bf6146f25923965 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:36:43 -0400
Subject: turn do_make_slave() into transfer_propagation()

Lift calculation of replacement propagation source, removal from
peer group and assignment of ->mnt_master from do_make_slave() into
change_mnt_propagation() itself.  What remains is switching of
what used to get propagation *through* mnt to alternative source.
Rename to transfer_propagation(), passing it the replacement source
as the second argument.  Have it return void, while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 91d10af867bdef..0a54848cbbd158 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -83,19 +83,10 @@ static struct mount *propagation_source(struct mount *mnt)
 	return mnt;
 }
 
-static int do_make_slave(struct mount *mnt)
+static void transfer_propagation(struct mount *mnt, struct mount *to)
 {
-	struct mount *master = propagation_source(mnt);
 	struct mount *slave_mnt;
-
-	if (list_empty(&mnt->mnt_share)) {
-		mnt_release_group_id(mnt);
-	} else {
-		list_del_init(&mnt->mnt_share);
-		mnt->mnt_group_id = 0;
-	}
-	CLEAR_MNT_SHARED(mnt);
-	if (!master) {
+	if (!to) {
 		struct list_head *p = &mnt->mnt_slave_list;
 		while (!list_empty(p)) {
 			slave_mnt = list_first_entry(p,
@@ -103,14 +94,12 @@ static int do_make_slave(struct mount *mnt)
 			list_del_init(&slave_mnt->mnt_slave);
 			slave_mnt->mnt_master = NULL;
 		}
-		return 0;
+		return;
 	}
 	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
-		slave_mnt->mnt_master = master;
-	list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
+		slave_mnt->mnt_master = to;
+	list_splice(&mnt->mnt_slave_list, to->mnt_slave_list.prev);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
-	mnt->mnt_master = master;
-	return 0;
 }
 
 /*
@@ -122,8 +111,19 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		set_mnt_shared(mnt);
 		return;
 	}
-	if (IS_MNT_SHARED(mnt))
-		do_make_slave(mnt);
+	if (IS_MNT_SHARED(mnt)) {
+		struct mount *m = propagation_source(mnt);
+
+		if (list_empty(&mnt->mnt_share)) {
+			mnt_release_group_id(mnt);
+		} else {
+			list_del_init(&mnt->mnt_share);
+			mnt->mnt_group_id = 0;
+		}
+		CLEAR_MNT_SHARED(mnt);
+		transfer_propagation(mnt, m);
+		mnt->mnt_master = m;
+	}
 	list_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)
-- 
cgit 1.2.3-korg


From 8c5a853f58c5b86b033842b78a0ad3d1208672fa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:51:31 -0400
Subject: mnt_slave_list/mnt_slave: turn into hlist_head/hlist_node

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  4 ++--
 fs/namespace.c | 14 ++++++--------
 fs/pnode.c     | 41 +++++++++++++++++++----------------------
 3 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/mount.h b/fs/mount.h
index f299dc85446d70..08583428b10b55 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -69,8 +69,8 @@ struct mount {
 	struct list_head mnt_list;
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct list_head mnt_share;	/* circular list of shared mounts */
-	struct list_head mnt_slave_list;/* list of slave mounts */
-	struct list_head mnt_slave;	/* slave list entry */
+	struct hlist_head mnt_slave_list;/* list of slave mounts */
+	struct hlist_node mnt_slave;	/* slave list entry */
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
diff --git a/fs/namespace.c b/fs/namespace.c
index da27365418a5a6..38a46b32413d02 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -380,8 +380,8 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_expire);
 		INIT_LIST_HEAD(&mnt->mnt_share);
-		INIT_LIST_HEAD(&mnt->mnt_slave_list);
-		INIT_LIST_HEAD(&mnt->mnt_slave);
+		INIT_HLIST_HEAD(&mnt->mnt_slave_list);
+		INIT_HLIST_NODE(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		RB_CLEAR_NODE(&mnt->mnt_node);
@@ -1348,10 +1348,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 	if ((flag & CL_SLAVE) ||
 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
-		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+		hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
 		mnt->mnt_master = old;
 	} else if (IS_MNT_SLAVE(old)) {
-		list_add(&mnt->mnt_slave, &old->mnt_slave);
+		hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
 		mnt->mnt_master = old->mnt_master;
 	}
 	return mnt;
@@ -3398,10 +3398,8 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 		goto out;
 
 	if (IS_MNT_SLAVE(from)) {
-		struct mount *m = from->mnt_master;
-
-		list_add(&to->mnt_slave, &from->mnt_slave);
-		to->mnt_master = m;
+		hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
+		to->mnt_master = from->mnt_master;
 	}
 
 	if (IS_MNT_SHARED(from)) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 0a54848cbbd158..69278079faeb7d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -21,12 +21,12 @@ static inline struct mount *next_peer(struct mount *p)
 
 static inline struct mount *first_slave(struct mount *p)
 {
-	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
+	return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave);
 }
 
 static inline struct mount *next_slave(struct mount *p)
 {
-	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
+	return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
 }
 
 static struct mount *get_peer_under_root(struct mount *mnt,
@@ -85,21 +85,18 @@ static struct mount *propagation_source(struct mount *mnt)
 
 static void transfer_propagation(struct mount *mnt, struct mount *to)
 {
-	struct mount *slave_mnt;
-	if (!to) {
-		struct list_head *p = &mnt->mnt_slave_list;
-		while (!list_empty(p)) {
-			slave_mnt = list_first_entry(p,
-					struct mount, mnt_slave);
-			list_del_init(&slave_mnt->mnt_slave);
-			slave_mnt->mnt_master = NULL;
-		}
-		return;
+	struct hlist_node *p = NULL, *n;
+	struct mount *m;
+
+	hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) {
+		m->mnt_master = to;
+		if (!to)
+			hlist_del_init(&m->mnt_slave);
+		else
+			p = &m->mnt_slave;
 	}
-	list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
-		slave_mnt->mnt_master = to;
-	list_splice(&mnt->mnt_slave_list, to->mnt_slave_list.prev);
-	INIT_LIST_HEAD(&mnt->mnt_slave_list);
+	if (p)
+		hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list);
 }
 
 /*
@@ -124,10 +121,10 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		transfer_propagation(mnt, m);
 		mnt->mnt_master = m;
 	}
-	list_del_init(&mnt->mnt_slave);
+	hlist_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
 		if (mnt->mnt_master)
-			list_add(&mnt->mnt_slave,
+			hlist_add_head(&mnt->mnt_slave,
 				 &mnt->mnt_master->mnt_slave_list);
 	} else {
 		mnt->mnt_master = NULL;
@@ -147,7 +144,7 @@ static struct mount *__propagation_next(struct mount *m,
 		if (master == origin->mnt_master) {
 			struct mount *next = next_peer(m);
 			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+		} else if (m->mnt_slave.next)
 			return next_slave(m);
 
 		/* back at master */
@@ -169,7 +166,7 @@ static struct mount *propagation_next(struct mount *m,
 					 struct mount *origin)
 {
 	/* are there any slaves of this mount? */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+	if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
 	return __propagation_next(m, origin);
@@ -194,7 +191,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 	while (1) {
 		while (1) {
 			struct mount *next;
-			if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+			if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
 				return first_slave(m);
 			next = next_peer(m);
 			if (m->mnt_group_id == origin->mnt_group_id) {
@@ -207,7 +204,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 		/* m is the last peer */
 		while (1) {
 			struct mount *master = m->mnt_master;
-			if (m->mnt_slave.next != &master->mnt_slave_list)
+			if (m->mnt_slave.next)
 				return next_slave(m);
 			m = next_peer(master);
 			if (master->mnt_group_id == origin->mnt_group_id)
-- 
cgit 1.2.3-korg


From dd5a4e1d640bf3542c4583491e6b91d25de3b760 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 24 Jun 2025 23:54:41 -0400
Subject: change_mnt_propagation(): move ->mnt_master assignment into MS_SLAVE
 case

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/pnode.c b/fs/pnode.c
index 69278079faeb7d..cbf5f5746252d6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -104,13 +104,14 @@ static void transfer_propagation(struct mount *mnt, struct mount *to)
  */
 void change_mnt_propagation(struct mount *mnt, int type)
 {
+	struct mount *m = mnt->mnt_master;
+
 	if (type == MS_SHARED) {
 		set_mnt_shared(mnt);
 		return;
 	}
 	if (IS_MNT_SHARED(mnt)) {
-		struct mount *m = propagation_source(mnt);
-
+		m = propagation_source(mnt);
 		if (list_empty(&mnt->mnt_share)) {
 			mnt_release_group_id(mnt);
 		} else {
@@ -119,13 +120,12 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		}
 		CLEAR_MNT_SHARED(mnt);
 		transfer_propagation(mnt, m);
-		mnt->mnt_master = m;
 	}
 	hlist_del_init(&mnt->mnt_slave);
 	if (type == MS_SLAVE) {
-		if (mnt->mnt_master)
-			hlist_add_head(&mnt->mnt_slave,
-				 &mnt->mnt_master->mnt_slave_list);
+		mnt->mnt_master = m;
+		if (m)
+			hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list);
 	} else {
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
-- 
cgit 1.2.3-korg


From 663206854f020ec6fc6bfd3d52f501a28ede1403 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 17 Jun 2025 21:35:22 -0400
Subject: copy_tree(): don't link the mounts via mnt_list

The only place that really needs to be adjusted is commit_tree() -
there we need to iterate through the copy and we might as well
use next_mnt() for that.  However, in case when our tree has been
slid under something already mounted (propagation to a mountpoint
that already has something mounted on it or a 'beneath' move_mount)
we need to take care not to walk into the overmounting tree.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  3 +--
 fs/namespace.c | 60 +++++++++++++++++++++++-----------------------------------
 fs/pnode.c     |  3 ++-
 3 files changed, 27 insertions(+), 39 deletions(-)

(limited to 'fs/pnode.c')

diff --git a/fs/mount.h b/fs/mount.h
index 08583428b10b55..97737051a8b9df 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -193,7 +193,7 @@ static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
 	return RB_EMPTY_ROOT(&ns->mounts);
 }
 
-static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+static inline void move_from_ns(struct mount *mnt)
 {
 	struct mnt_namespace *ns = mnt->mnt_ns;
 	WARN_ON(!mnt_ns_attached(mnt));
@@ -203,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 		ns->mnt_first_node = rb_next(&mnt->mnt_node);
 	rb_erase(&mnt->mnt_node, &ns->mounts);
 	RB_CLEAR_NODE(&mnt->mnt_node);
-	list_add_tail(&mnt->mnt_list, dt_list);
 }
 
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 38a46b32413d02..bd6c7da901fcab 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1161,34 +1161,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 	mnt_notify_add(mnt);
 }
 
-/*
- * vfsmount lock must be held for write
- */
-static void commit_tree(struct mount *mnt)
-{
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
-	LIST_HEAD(head);
-	struct mnt_namespace *n = parent->mnt_ns;
-
-	BUG_ON(parent == mnt);
-
-	if (!mnt_ns_attached(mnt)) {
-		list_add_tail(&head, &mnt->mnt_list);
-		while (!list_empty(&head)) {
-			m = list_first_entry(&head, typeof(*m), mnt_list);
-			list_del(&m->mnt_list);
-
-			mnt_add_to_ns(n, m);
-		}
-		n->nr_mounts += n->pending_mounts;
-		n->pending_mounts = 0;
-	}
-
-	make_visible(mnt);
-	touch_mnt_namespace(n);
-}
-
 static struct mount *next_mnt(struct mount *p, struct mount *root)
 {
 	struct list_head *next = p->mnt_mounts.next;
@@ -1215,6 +1187,27 @@ static struct mount *skip_mnt_tree(struct mount *p)
 	return p;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt)
+{
+	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
+
+	if (!mnt_ns_attached(mnt)) {
+		for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
+			if (unlikely(mnt_ns_attached(m)))
+				m = skip_mnt_tree(m);
+			else
+				mnt_add_to_ns(n, m);
+		n->nr_mounts += n->pending_mounts;
+		n->pending_mounts = 0;
+	}
+
+	make_visible(mnt);
+	touch_mnt_namespace(n);
+}
+
 /**
  * vfs_create_mount - Create a mount for a configured superblock
  * @fc: The configuration context with the superblock attached
@@ -1831,9 +1824,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		p->mnt.mnt_flags |= MNT_UMOUNT;
 		if (mnt_ns_attached(p))
-			move_from_ns(p, &tmp_list);
-		else
-			list_move(&p->mnt_list, &tmp_list);
+			move_from_ns(p);
+		list_add_tail(&p->mnt_list, &tmp_list);
 	}
 
 	/* Hide the mounts from mnt_mounts */
@@ -2270,7 +2262,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 					list_add(&dst_mnt->mnt_expire,
 						 &src_mnt->mnt_expire);
 			}
-			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
 			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
 			unlock_mount_hash();
 		}
@@ -2686,12 +2677,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		list_del_init(&source_mnt->mnt_expire);
 	} else {
 		if (source_mnt->mnt_ns) {
-			LIST_HEAD(head);
-
 			/* move from anon - the caller will destroy */
 			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
-				move_from_ns(p, &head);
-			list_del_init(&head);
+				move_from_ns(p);
 		}
 	}
 
diff --git a/fs/pnode.c b/fs/pnode.c
index cbf5f5746252d6..81f7599bdac4fc 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -449,7 +449,8 @@ static void umount_one(struct mount *m, struct list_head *to_umount)
 {
 	m->mnt.mnt_flags |= MNT_UMOUNT;
 	list_del_init(&m->mnt_child);
-	move_from_ns(m, to_umount);
+	move_from_ns(m);
+	list_add_tail(&m->mnt_list, to_umount);
 }
 
 static void remove_from_candidate_list(struct mount *m)
-- 
cgit 1.2.3-korg