Set next multixid's offset when creating a new multixid

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c

index 9d5f130af7efeeb057a3990d6ecd728e4e233124..27f02faec80f0cd651911ceb34c9cb169c900f52 100644 (file)
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -79,7 +79,6 @@
  #include "pg_trace.h"
  #include "pgstat.h"
  #include "postmaster/autovacuum.h"
-#include "storage/condition_variable.h"
  #include "storage/pmsignal.h"
  #include "storage/proc.h"
  #include "storage/procarray.h"
@@ -271,12 +270,6 @@ typedef struct MultiXactStateData
     /* support for members anti-wraparound measures */
     MultiXactOffset offsetStopLimit;    /* known if oldestOffsetKnown */
  
-   /*
-    * This is used to sleep until a multixact offset is written when we want
-    * to create the next one.
-    */
-   ConditionVariable nextoff_cv;
-
     /*
      * Per-backend data starts here.  We have two arrays stored in the area
      * immediately following the MultiXactStateData struct. Each is indexed by
@@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
     int         entryno;
     int         slotno;
     MultiXactOffset *offptr;
-   int         i;
+   MultiXactId next;
+   int64       next_pageno;
+   int         next_entryno;
+   MultiXactOffset *next_offptr;
     LWLock     *lock;
     LWLock     *prevlock = NULL;
  
+   /* position of this multixid in the offsets SLRU area  */
     pageno = MultiXactIdToOffsetPage(multi);
     entryno = MultiXactIdToOffsetEntry(multi);
  
+   /* position of the next multixid */
+   next = multi + 1;
+   if (next < FirstMultiXactId)
+       next = FirstMultiXactId;
+   next_pageno = MultiXactIdToOffsetPage(next);
+   next_entryno = MultiXactIdToOffsetEntry(next);
+
+   /*
+    * Set the starting offset of this multixid's members.
+    *
+    * In the common case, it was already be set by the previous
+    * RecordNewMultiXact call, as this was the next multixid of the previous
+    * multixid.  But if multiple backends are generating multixids
+    * concurrently, we might race ahead and get called before the previous
+    * multixid.
+    */
     lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
     LWLockAcquire(lock, LW_EXCLUSIVE);
  
@@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
     offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
     offptr += entryno;
  
-   *offptr = offset;
+   if (*offptr != offset)
+   {
+       /* should already be set to the correct value, or not at all */
+       Assert(*offptr == 0);
+       *offptr = offset;
+       MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+   }
+
+   /*
+    * Set the next multixid's offset to the end of this multixid's members.
+    */
+   if (next_pageno == pageno)
+   {
+       next_offptr = offptr + 1;
+   }
+   else
+   {
+       /* must be the first entry on the page */
+       Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+       /* Swap the lock for a lock on the next page */
+       LWLockRelease(lock);
+       lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+       LWLockAcquire(lock, LW_EXCLUSIVE);
+
+       slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+       next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+       next_offptr += next_entryno;
+   }
  
-   MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+   if (*next_offptr != offset + nmembers)
+   {
+       /* should already be set to the correct value, or not at all */
+       Assert(*next_offptr == 0);
+       *next_offptr = offset + nmembers;
+       MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+   }
  
     /* Release MultiXactOffset SLRU lock. */
     LWLockRelease(lock);
  
-   /*
-    * If anybody was waiting to know the offset of this multixact ID we just
-    * wrote, they can read it now, so wake them up.
-    */
-   ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
     prev_pageno = -1;
  
-   for (i = 0; i < nmembers; i++, offset++)
+   for (int i = 0; i < nmembers; i++, offset++)
     {
         TransactionId *memberptr;
         uint32     *flagsptr;
@@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
             result = FirstMultiXactId;
     }
  
-   /* Make sure there is room for the MXID in the file.  */
-   ExtendMultiXactOffset(result);
+   /*
+    * Make sure there is room for the next MXID in the file.  Assigning this
+    * MXID sets the next MXID's offset already.
+    */
+   ExtendMultiXactOffset(result + 1);
  
     /*
      * Reserve the members space, similarly to above.  Also, be careful not to
@@ -1300,11 +1344,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
     int         truelength;
     MultiXactId oldestMXact;
     MultiXactId nextMXact;
-   MultiXactId tmpMXact;
-   MultiXactOffset nextOffset;
     MultiXactMember *ptr;
     LWLock     *lock;
-   bool        slept = false;
  
     debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
  
@@ -1351,14 +1392,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
      * error.
      *
      * Shared lock is enough here since we aren't modifying any global state.
-    * Acquire it just long enough to grab the current counter values.  We may
-    * need both nextMXact and nextOffset; see below.
+    * Acquire it just long enough to grab the current counter values.
      */
     LWLockAcquire(MultiXactGenLock, LW_SHARED);
  
     oldestMXact = MultiXactState->oldestMultiXactId;
     nextMXact = MultiXactState->nextMXact;
-   nextOffset = MultiXactState->nextOffset;
  
     LWLockRelease(MultiXactGenLock);
  
@@ -1378,38 +1417,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
      * Find out the offset at which we need to start reading MultiXactMembers
      * and the number of members in the multixact.  We determine the latter as
      * the difference between this multixact's starting offset and the next
-    * one's.  However, there are some corner cases to worry about:
+    * one's.  However, there is one corner case to worry about:
      *
-    * 1. This multixact may be the latest one created, in which case there is
-    * no next one to look at.  In this case the nextOffset value we just
-    * saved is the correct endpoint.
-    *
-    * 2. The next multixact may still be in process of being filled in: that
-    * is, another process may have done GetNewMultiXactId but not yet written
-    * the offset entry for that ID.  In that scenario, it is guaranteed that
-    * the offset entry for that multixact exists (because GetNewMultiXactId
-    * won't release MultiXactGenLock until it does) but contains zero
-    * (because we are careful to pre-zero offset pages). Because
-    * GetNewMultiXactId will never return zero as the starting offset for a
-    * multixact, when we read zero as the next multixact's offset, we know we
-    * have this case.  We handle this by sleeping on the condition variable
-    * we have just for this; the process in charge will signal the CV as soon
-    * as it has finished writing the multixact offset.
-    *
-    * 3. Because GetNewMultiXactId increments offset zero to offset one to
-    * handle case #2, there is an ambiguity near the point of offset
+    * Because GetNewMultiXactId skips over offset zero, to reserve zero for
+    * to mean "unset", there is an ambiguity near the point of offset
      * wraparound.  If we see next multixact's offset is one, is that our
      * multixact's actual endpoint, or did it end at zero with a subsequent
      * increment?  We handle this using the knowledge that if the zero'th
      * member slot wasn't filled, it'll contain zero, and zero isn't a valid
      * transaction ID so it can't be a multixact member.  Therefore, if we
      * read a zero from the members array, just ignore it.
-    *
-    * This is all pretty messy, but the mess occurs only in infrequent corner
-    * cases, so it seems better than holding the MultiXactGenLock for a long
-    * time on every multixact creation.
      */
-retry:
     pageno = MultiXactIdToOffsetPage(multi);
     entryno = MultiXactIdToOffsetEntry(multi);
  
@@ -1417,6 +1435,7 @@ retry:
     lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
     LWLockAcquire(lock, LW_EXCLUSIVE);
  
+   /* read this multi's offset */
     slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
     offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
     offptr += entryno;
@@ -1424,22 +1443,13 @@ retry:
  
     Assert(offset != 0);
  
-   /*
-    * Use the same increment rule as GetNewMultiXactId(), that is, don't
-    * handle wraparound explicitly until needed.
-    */
-   tmpMXact = multi + 1;
-
-   if (nextMXact == tmpMXact)
-   {
-       /* Corner case 1: there is no next multixact */
-       length = nextOffset - offset;
-   }
-   else
+   /* read next multi's offset */
     {
+       MultiXactId tmpMXact;
         MultiXactOffset nextMXOffset;
  
         /* handle wraparound if needed */
+       tmpMXact = multi + 1;
         if (tmpMXact < FirstMultiXactId)
             tmpMXact = FirstMultiXactId;
  
@@ -1472,18 +1482,10 @@ retry:
         nextMXOffset = *offptr;
  
         if (nextMXOffset == 0)
-       {
-           /* Corner case 2: next multixact is still being filled in */
-           LWLockRelease(lock);
-           CHECK_FOR_INTERRUPTS();
-
-           INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-           ConditionVariableSleep(&MultiXactState->nextoff_cv,
-                                  WAIT_EVENT_MULTIXACT_CREATION);
-           slept = true;
-           goto retry;
-       }
+           ereport(ERROR,
+                   (errcode(ERRCODE_DATA_CORRUPTED),
+                    errmsg("MultiXact %u has invalid next offset",
+                           multi)));
  
         length = nextMXOffset - offset;
     }
@@ -1491,12 +1493,7 @@ retry:
     LWLockRelease(lock);
     lock = NULL;
  
-   /*
-    * If we slept above, clean up state; it's no longer needed.
-    */
-   if (slept)
-       ConditionVariableCancelSleep();
-
+   /* read the members */
     ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
  
     truelength = 0;
@@ -1539,7 +1536,7 @@ retry:
  
         if (!TransactionIdIsValid(*xactptr))
         {
-           /* Corner case 3: we must be looking at unused slot zero */
+           /* Corner case: we must be looking at unused slot zero */
             Assert(offset == 0);
             continue;
         }
@@ -1986,7 +1983,6 @@ MultiXactShmemInit(void)
  
         /* Make sure we zero out the per-backend state */
         MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-       ConditionVariableInit(&MultiXactState->nextoff_cv);
     }
     else
         Assert(found);
@@ -2132,26 +2128,34 @@ TrimMultiXact(void)
                         pageno);
  
     /*
-    * Zero out the remainder of the current offsets page.  See notes in
-    * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
-    * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
-    * rule "write xlog before data," nextMXact successors may carry obsolete,
-    * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-    * operates normally.
+    * Set the offset of nextMXact on the offsets page.  This is normally done
+    * in RecordNewMultiXact() of the previous multixact, but let's be sure
+    * the next page exists, if the nextMXact was reset with pg_resetwal for
+    * example.
+    *
+    * Zero out the remainder of the page.  See notes in TrimCLOG() for
+    * background.  Unlike CLOG, some WAL record covers every pg_multixact
+    * SLRU mutation.  Since, also unlike CLOG, we ignore the WAL rule "write
+    * xlog before data," nextMXact successors may carry obsolete, nonzero
+    * offset values.
      */
     entryno = MultiXactIdToOffsetEntry(nextMXact);
-   if (entryno != 0)
     {
         int         slotno;
         MultiXactOffset *offptr;
         LWLock     *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
  
         LWLockAcquire(lock, LW_EXCLUSIVE);
-       slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
+       if (entryno == 0)
+           slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+       else
+           slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
         offptr += entryno;
  
-       MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+       *offptr = offset;
+       if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
+           MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
  
         MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
         LWLockRelease(lock);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h

index 34deb2fe5f04d734b409ef582fb3da2d33f36d60..3be5ecf336f6fd54f3c793a45fe0245d477d3faa 100644 (file)
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
  /*
   * Each page of XLOG file has a header like this:
   */
-#define XLOG_PAGE_MAGIC 0xD119 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD11A /* can be used as WAL version indicator */
  
  typedef struct XLogPageHeaderData
  {
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl

index e2b567a603d31399891c218a3446c87248179531..7837eb810f0181ef48eeada615ad18fe31b49ad2 100644 (file)
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -1,10 +1,6 @@
  # Copyright (c) 2024-2025, PostgreSQL Global Development Group
  
-# This test verifies edge case of reading a multixact:
-# when we have multixact that is followed by exactly one another multixact,
-# and another multixact have no offset yet, we must wait until this offset
-# becomes observable. Previously we used to wait for 1ms in a loop in this
-# case, but now we use CV for this. This test is exercising such a sleep.
+# Test multixid corner cases.
  
  use strict;
  use warnings FATAL => 'all';
@@ -19,9 +15,7 @@ if ($ENV{enable_injection_points} ne 'yes')
     plan skip_all => 'Injection points not supported by this build';
  }
  
-my ($node, $result);
-
-$node = PostgreSQL::Test::Cluster->new('mike');
+my $node = PostgreSQL::Test::Cluster->new('main');
  $node->init;
  $node->append_conf('postgresql.conf',
     "shared_preload_libraries = 'test_slru,injection_points'");
@@ -29,95 +23,47 @@ $node->start;
  $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
  $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
  
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-   q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-   q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# This test creates three multixacts. The middle one is never
+# WAL-logged or recorded on the offsets page, because we pause the
+# backend and crash the server before that. After restart, verify that
+# the other multixacts are readable, despite the middle one being
+# lost.
  
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-   qr/start/,
-   q{
-   \echo start
-   SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+# Create the first multixact
+my $bg_psql = $node->background_psql('postgres');
+my $multi1 = $bg_psql->query_safe(q(SELECT test_create_multixact();));
+
+# Assign the middle multixact. Use an injection point to prevent it
+# from being fully recorded.
  $node->safe_psql('postgres',
     q{SELECT injection_points_attach('multixact-create-from-members','wait');}
  );
  
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-   qr/start/, q{
-   \echo start
+$bg_psql->query_until(
+   qr/assigning lost multi/, q(
+\echo assigning lost multi
     SELECT test_create_multixact();
-});
+));
  
  $node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-       'postgres',
-       q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-       FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-   ),
-   'multixact-create-from-members, test-multixact-read',
-   "matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-   q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-       'postgres',
-       q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-       FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-   ),
-   'multixact-create-from-members, multixact-get-members-cv-sleep',
-   "matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-   q{SELECT injection_points_detach('test-multixact-read')});
  $node->safe_psql('postgres',
     q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-   q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
  
-$node->safe_psql('postgres',
-   q{SELECT injection_points_wakeup('multixact-create-from-members')});
-$node->safe_psql('postgres',
-   q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+# Create the third multixid
+my $multi2 = $node->safe_psql('postgres', q{SELECT test_create_multixact();});
+
+# All set and done, it's time for hard restart
+$node->stop('immediate');
+$node->start;
+$bg_psql->{run}->finish;
  
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# Verify that the recorded multixids are readable
+is( $node->safe_psql('postgres', qq{SELECT test_read_multixact('$multi1');}),
+   '',
+   'first recorded multi is readable');
  
-$node->stop;
+is( $node->safe_psql('postgres', qq{SELECT test_read_multixact('$multi2');}),
+   '',
+   'second recorded multi is readable');
  
-# If we reached this point - everything is OK.
-ok(1);
  done_testing();
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c

index 6c9b0420717cc59ce140f10a46b022fe3d1bf9d5..8fb6c19d70fd3a1175741a93a43fc5f2f2bb7d44 100644 (file)
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -17,7 +17,6 @@
  #include "access/multixact.h"
  #include "access/xact.h"
  #include "fmgr.h"
-#include "utils/injection_point.h"
  
  PG_FUNCTION_INFO_V1(test_create_multixact);
  PG_FUNCTION_INFO_V1(test_read_multixact);
@@ -37,8 +36,7 @@ test_create_multixact(PG_FUNCTION_ARGS)
  }
  
  /*
- * Reads given multixact after running an injection point. Discards local cache
- * to make a real read.  Tailored for multixact testing.
+ * Reads given multixact.  Discards local cache to make a real read.
   */
  Datum
  test_read_multixact(PG_FUNCTION_ARGS)
@@ -46,7 +44,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
     MultiXactId id = PG_GETARG_TRANSACTIONID(0);
     MultiXactMember *members;
  
-   INJECTION_POINT("test-multixact-read", NULL);
     /* discard caches */
     AtEOXact_MultiXact();
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Wed, 3 Dec 2025 17:15:08 +0000 (19:15 +0200)
src/backend/access/transam/multixact.c		patch \| blob \| blame \| history
src/include/access/xlog_internal.h		patch \| blob \| blame \| history
src/test/modules/test_slru/t/001_multixact.pl		patch \| blob \| blame \| history
src/test/modules/test_slru/test_multixact.c		patch \| blob \| blame \| history