Add TAP test to check recovery when redo LSN is missing

author Michael Paquier <michael@paquier.xyz>

Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)

committer Michael Paquier <michael@paquier.xyz>

Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)
author Michael Paquier <michael@paquier.xyz>
Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)
committer Michael Paquier <michael@paquier.xyz>
Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 6a5640df51afc359c321be00cd021705dc908089..430a38b1a216acff593235544eca44693ce28f08 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7001,6 +7001,10 @@ CreateCheckPoint(int flags)
      */
     SyncPreCheckpoint();
  
+   /* Run these points outside the critical section. */
+   INJECTION_POINT("create-checkpoint-initial", NULL);
+   INJECTION_POINT_LOAD("create-checkpoint-run");
+
     /*
      * Use a critical section to force system panic if we have trouble.
      */
@@ -7151,6 +7155,8 @@ CreateCheckPoint(int flags)
     if (log_checkpoints)
         LogCheckpointStart(flags, false);
  
+   INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
+
     /* Update the process title */
     update_checkpoint_display(flags, false, false);
  
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c

index 9563b3e5c12c4ea944f51c1e31bc85b6f3bc573f..38b594d2170923fe73c1a29cf88ed8c8e5419050 100644 (file)
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -811,7 +811,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
         {
             XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
             if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
-               ereport(PANIC,
+               ereport(FATAL,
                         errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
                                LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
         }
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build

index 523a5cd5b52795b3240970c60f431f1a9b7ee3cd..e93248bd66e22893cbb194f3db8efa3176596f45 100644 (file)
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -58,6 +58,7 @@ tests += {
        't/047_checkpoint_physical_slot.pl',
        't/048_vacuum_horizon_floor.pl',
        't/049_wait_for_lsn.pl',
+      't/050_redo_segment_missing.pl',
      ],
    },
  }
diff --git a/src/test/recovery/t/050_redo_segment_missing.pl b/src/test/recovery/t/050_redo_segment_missing.pl

new file mode 100644 (file)

index 0000000..f5eb6c3
--- /dev/null
+++ b/src/test/recovery/t/050_redo_segment_missing.pl
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Evaluates PostgreSQL's recovery behavior when a WAL segment containing the
+# redo record is missing, with a checkpoint record located in a different
+# segment.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# An unset variable also means "not enabled"; default it to '' so that the
+# comparison cannot trip the fatal "uninitialized value" warning.
+plan skip_all => 'Injection points not supported by this build'
+  if ($ENV{enable_injection_points} // '') ne 'yes';
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+   plan skip_all => 'Extension injection_points not installed';
+}
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Note that this uses two injection points based on waits, not one.  This
+# may look strange, but this works as a workaround to enforce all memory
+# allocations to happen outside the critical section of the checkpoint
+# required for this test.
+# First, "create-checkpoint-initial" is run outside the critical section,
+# and is used as a way to initialize the shared memory required for the
+# wait machinery with its DSM registry.
+# Then, "create-checkpoint-run" is loaded outside the critical section of
+# a checkpoint to allocate any memory required by the library load, and
+# its callback is run inside the critical section.
+$node->safe_psql('postgres',
+   q{select injection_points_attach('create-checkpoint-initial', 'wait')});
+$node->safe_psql('postgres',
+   q{select injection_points_attach('create-checkpoint-run', 'wait')});
+
+# Start a psql session to run the checkpoint in the background and make
+# the test wait on the injection point so the checkpoint stops just after
+# it starts.
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_until(
+   qr/starting_checkpoint/,
+   q(\echo starting_checkpoint
+checkpoint;
+));
+
+# Wait for the initial point to finish, the checkpointer is still
+# outside its critical section.  Then release to reach the second
+# point.
+$node->wait_for_event('checkpointer', 'create-checkpoint-initial');
+$node->safe_psql('postgres',
+   q{select injection_points_wakeup('create-checkpoint-initial')});
+
+# Wait until the checkpoint has reached the second injection point.
+# We are now in the middle of a checkpoint running, after the redo
+# record has been logged.
+$node->wait_for_event('checkpointer', 'create-checkpoint-run');
+
+# Switch the WAL segment, ensuring that the redo record will be included
+# in a different segment than the checkpoint record.
+$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Continue the checkpoint and wait for its completion.
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres',
+   q{select injection_points_wakeup('create-checkpoint-run')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+$checkpoint->quit;
+
+# Retrieve the WAL file names for the redo record and checkpoint record.
+my $redo_lsn = $node->safe_psql('postgres',
+   "SELECT redo_lsn FROM pg_control_checkpoint()");
+my $redo_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
+my $checkpoint_lsn = $node->safe_psql('postgres',
+   "SELECT checkpoint_lsn FROM pg_control_checkpoint()");
+my $checkpoint_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+
+# Redo record and checkpoint record should be on different segments.
+isnt($redo_walfile_name, $checkpoint_walfile_name,
+   'redo and checkpoint records on different segments');
+
+# Remove the WAL segment containing the redo record.
+unlink $node->data_dir . "/pg_wal/$redo_walfile_name"
+  or die "could not remove WAL file: $!";
+
+$node->stop('immediate');
+
+# Use run_log instead of node->start because this test expects that
+# the server ends with an error during recovery.
+run_log(
+   [
+       'pg_ctl',
+       '--pgdata' => $node->data_dir,
+       '--log' => $node->logfile,
+       'start',
+   ]);
+
+# Confirm that recovery has failed, as expected.
+my $logfile = slurp_file($node->logfile());
+like($logfile,
+     qr/FATAL: .* could not find redo location .* referenced by checkpoint record at .*/,
+   "ends with FATAL because it could not find redo location");
+
+done_testing();
author	Michael Paquier <michael@paquier.xyz>
	Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)
committer	Michael Paquier <michael@paquier.xyz>
	Tue, 16 Dec 2025 05:28:05 +0000 (14:28 +0900)
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/access/transam/xlogrecovery.c		patch \| blob \| blame \| history
src/test/recovery/meson.build		patch \| blob \| blame \| history
src/test/recovery/t/050_redo_segment_missing.pl	[new file with mode: 0644]	patch \| blob