lvmlockd: use persistent reservations for recovery with sanlock dev-dct-setlockargs-3
author David Teigland <teigland@redhat.com>
Wed, 13 Aug 2025 14:57:20 +0000 (09:57 -0500)
committer David Teigland <teigland@redhat.com>
Fri, 3 Oct 2025 14:57:10 +0000 (09:57 -0500)
The process of using persistent reservations for recovery:

host A owns a lock
host A fails
host B requests the lock
host B request fails because A owns the lock
host A enters the FAIL state in sanlock
host B retries the lock, and sees owner A is failed
host B runs lvmpersist to remove the PR key of host A
host B tells sanlock that host A is dead
host B retries the lock, which is now granted by sanlock

The new option: --setlockargs persist,notimeout
adds "persist" and "notimeout" components to the
lock_args string in the VG metadata, viewable with
vgs -o+lockargs.  The option is available in vgcreate
or vgchange.

. "persist" tells lvmlockd to remove the PR key of
  a failed host, by running lvmpersist, and set the
  sanlock host state to "dead".

. "notimeout" tells lvmlockd to configure sanlock
  leases to not time out.  sanlock does not use the
  watchdog to protect leases that do not time out.

With this combination, PR removal replaces the watchdog
for fencing hosts with expired leases that are blocking
other hosts.

The lock_args version component is "2.0.0" when
these new settings are used, otherwise remains
"1.0.0".  Previous lvm versions will not start a
VG with lockargs version 2.

Requires sanlock version 4.2.0 or later.

23 files changed:
configure.ac
daemons/lvmlockd/Makefile.in
daemons/lvmlockd/lvmlockd-client.h
daemons/lvmlockd/lvmlockd-core.c
daemons/lvmlockd/lvmlockd-dlm.c
daemons/lvmlockd/lvmlockd-helper.c [new file with mode: 0644]
daemons/lvmlockd/lvmlockd-internal.h
daemons/lvmlockd/lvmlockd-sanlock.c
lib/device/persist.c
lib/device/persist.h
lib/locking/lvmlockd.c
lib/locking/lvmlockd.h
lib/metadata/metadata.c
man/lvmlockd.8_main
tools/args.h
tools/command-lines.in
tools/lvmcmdline.c
tools/toollib.c
tools/toollib.h
tools/tools.h
tools/vgchange.c
tools/vgcreate.c
tools/vgsplit.c

index d64a407595e407352333c1fac7bb5e08ef0fabde..58fc62da715afda339eb77ebd9501b486493054a 100644 (file)
@@ -934,9 +934,12 @@ AC_MSG_RESULT([$BUILD_LOCKDSANLOCK])
 
 dnl -- Look for sanlock libraries
 AS_IF([test "$BUILD_LOCKDSANLOCK" = "yes"], [
-       LOCKDSANLOCK_SUPPORT=370
+# FIXME: forcing sanlock 4.2.0 for testing, default should be 370
+#      LOCKDSANLOCK_SUPPORT=370
        PKG_CHECK_EXISTS(libsanlock_client >= 4.0.0, [LOCKDSANLOCK_SUPPORT=400])
        PKG_CHECK_EXISTS(libsanlock_client >= 4.1.0, [LOCKDSANLOCK_SUPPORT=410])
+       PKG_CHECK_EXISTS(libsanlock_client >= 4.2.0, [LOCKDSANLOCK_SUPPORT=420])
+       LOCKDSANLOCK_SUPPORT=420
        PKG_CHECK_MODULES(LIBSANLOCKCLIENT, libsanlock_client >= 3.7.0, [BUILD_LVMLOCKD="yes"])
        AC_DEFINE_UNQUOTED([LOCKDSANLOCK_SUPPORT], [$LOCKDSANLOCK_SUPPORT], [Define version of sanlock.])
 ])
index 7ae4b3da3e94c1cf702c6b3e0e23a31f89dc1d27..65a76510ae839fffd39ecd64e4441561aff99636 100644 (file)
@@ -15,7 +15,7 @@ srcdir = @srcdir@
 top_srcdir = @top_srcdir@
 top_builddir = @top_builddir@
 
-SOURCES = lvmlockd-core.c
+SOURCES = lvmlockd-core.c lvmlockd-helper.c
 SOURCES2 = lvmlockctl.c
 
 TARGETS = lvmlockd lvmlockctl
index 9a6f3a93982b5d5fdb7c55eafcecd4cfd4842dbd..acbb225102d93e5f500cf57f0e4e297372f7509f 100644 (file)
@@ -60,4 +60,11 @@ static inline void lvmlockd_close(daemon_handle h)
 #define EIOTIMEOUT   225
 #define ELOCKREPAIR  226
 
+#define LOCKARGS_VERSION       0x00000001 /* meta only */
+#define LOCKARGS_LVMLOCK       0x00000002 /* meta only */
+#define LOCKARGS_TIMEOUT        0x00000004 /* user only */
+#define LOCKARGS_NOTIMEOUT      0x00000008 /* meta or user */
+#define LOCKARGS_PERSIST        0x00000010 /* meta or user */
+#define LOCKARGS_NOPERSIST      0x00000020 /* user only */
+
 #endif /* _LVM_LVMLOCKD_CLIENT_H */
index d0c17802bebbc12c5ed618e497760f84e210edc3..f93c68e5461eb634df1722d60388eff59f5dc63d 100644 (file)
 #include <syslog.h>
 #include <dirent.h>
 #include <time.h>
+#include <fcntl.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/utsname.h>
 #include <sys/un.h>
+#include <sys/wait.h>
 
 #ifdef SD_NOTIFY_SUPPORT
 #include <systemd/sd-daemon.h>
@@ -180,6 +182,12 @@ static int listen_fd;
 static int restart_pi;
 static int restart_fds[2];
 
+static int helper_send_fd = -1; /* main loop sends requests to helper */
+static int helper_recv_fd = -1; /* main loop receives results from helper */
+static int helper_pid = -1;
+static int helper_pi = -1;
+static uint32_t helper_msg_id = 1;
+
 /*
  * Each lockspace has its own thread to do locking.
  * The lockspace thread makes synchronous lock requests to dlm/sanlock.
@@ -252,6 +260,8 @@ static int alloc_new_structs; /* used for initializing in setup_structs */
 #define DO_FORCE 1
 #define NO_FORCE 0
 
+static int add_fence_action(struct lockspace *ls, struct owner *owner);
+static int send_helper_request(struct action *act, char *ls_name, uint32_t new_msg_id);
 static int add_lock_action(struct action *act);
 static int str_to_lm(const char *str);
 static int setup_dump_socket(void);
@@ -405,6 +415,131 @@ static int dump_log(int *dump_len)
        return 0;
 }
 
+static void split_line(char *buf, int *argc, char **argv, int max_args, char sep)
+{
+       char *p = buf;
+       int i;
+
+       argv[0] = p;
+
+       for (i = 1; i < max_args; i++) {
+               p = strchr(p, sep);
+               if (!p)
+                       break;
+               *p++ = '\0';
+
+               argv[i] = p;
+       }
+       *argc = i;
+}
+
+int lockd_lockargs_get_version(char *str, unsigned int *major, unsigned int *minor, unsigned int *patch)
+{
+       char version[16] = {0};
+       char *major_str, *minor_str, *patch_str;
+       char *n, *d1, *d2;
+
+       strncpy(version, str, 15);
+
+       n = strchr(version, ':');
+       if (n)
+               *n = '\0';
+
+       d1 = strchr(version, '.');
+       if (!d1)
+               return -1;
+
+       d2 = strchr(d1 + 1, '.');
+       if (!d2)
+               return -1;
+
+       major_str = version;
+       minor_str = d1 + 1;
+       patch_str = d2 + 1;
+
+       *d1 = '\0';
+       *d2 = '\0';
+
+       if (major)
+               *major = atoi(major_str);
+       if (minor)
+               *minor = atoi(minor_str);
+       if (patch)
+               *patch = atoi(patch_str);
+
+       return 0;
+}
+
+#define MAX_LOCKARGS 8
+
+/* parse lock_args string for values that may appear in VG metadata lock_args */
+
+static int lockd_lockargs_get_meta_flags(const char *str, uint32_t *flags)
+{
+       char buf[PATH_MAX];
+       char *argv[MAX_LOCKARGS];
+       int argc;
+       int i;
+
+       if (!str)
+               return -1;
+
+       dm_strncpy(buf, str, sizeof(buf));
+
+       split_line(buf, &argc, argv, MAX_LOCKARGS, ':');
+
+       for (i = 0; i < argc; i++) {
+               if (!i && !lockd_lockargs_get_version(argv[i], NULL, NULL, NULL))
+                       *flags |= LOCKARGS_VERSION;
+               else if ((i == 1) && !strcmp(argv[i], "lvmlock"))
+                       *flags |= LOCKARGS_LVMLOCK;
+               else if (!strcmp(argv[i], "persist"))
+                       *flags |= LOCKARGS_PERSIST;
+               else if (!strcmp(argv[i], "notimeout"))
+                       *flags |= LOCKARGS_NOTIMEOUT;
+               else {
+                       log_error("Unknown lockargs meta value: %s", argv[i]);
+                       return -1;
+               }
+       }
+       log_debug("lockd_lockargs_get_meta_flags %s = 0x%x", str, *flags);
+       return 0;
+}
+
+/* parse lock_args string for values that may appear in command line --setlockargs */
+
+int lockd_lockargs_get_user_flags(const char *str, uint32_t *flags)
+{
+       char buf[PATH_MAX];
+       char *argv[MAX_LOCKARGS];
+       int argc;
+       int i;
+
+       if (!str)
+               return -1;
+
+       dm_strncpy(buf, str, sizeof(buf));
+
+       split_line(buf, &argc, argv, MAX_LOCKARGS, ',');
+
+       for (i = 0; i < argc; i++) {
+               if (!strcmp(argv[i], "persist"))
+                       *flags |= LOCKARGS_PERSIST;
+               else if (!strcmp(argv[i], "nopersist"))
+                       *flags |= LOCKARGS_NOPERSIST;
+               else if (!strcmp(argv[i], "timeout"))
+                       *flags |= LOCKARGS_TIMEOUT;
+               else if (!strcmp(argv[i], "notimeout"))
+                       *flags |= LOCKARGS_NOTIMEOUT;
+               else {
+                       log_error("Unknown lockargs option value: %s", argv[i]);
+                       return -1;
+               }
+       }
+       log_debug("lockd_lockargs_get_user_flags %s = 0x%x", str, *flags);
+       return 0;
+}
+
 struct lockspace *alloc_lockspace(void)
 {
        struct lockspace *ls;
@@ -417,6 +552,7 @@ struct lockspace *alloc_lockspace(void)
        INIT_LIST_HEAD(&ls->actions);
        INIT_LIST_HEAD(&ls->resources);
        INIT_LIST_HEAD(&ls->dispose);
+       INIT_LIST_HEAD(&ls->fence_history);
        pthread_mutex_init(&ls->mutex, NULL);
        pthread_cond_init(&ls->cond, NULL);
        return ls;
@@ -529,6 +665,7 @@ static struct resource *alloc_resource(void)
                memset(r, 0, sizeof(struct resource) + resource_lm_data_size);
                INIT_LIST_HEAD(&r->locks);
                INIT_LIST_HEAD(&r->actions);
+               INIT_LIST_HEAD(&r->fence_wait_actions);
        } else {
                log_error("out of memory for resource");
        }
@@ -586,6 +723,17 @@ static void free_client(struct client *cl)
 
 static void free_resource(struct resource *r)
 {
+       struct action *act, *act2;
+
+       list_for_each_entry_safe(act, act2, &r->actions, list) {
+               list_del(&act->list);
+               free_action(act);
+       }
+       list_for_each_entry_safe(act, act2, &r->fence_wait_actions, list) {
+               list_del(&act->list);
+               free_action(act);
+       }
+
        pthread_mutex_lock(&unused_struct_mutex);
        if (unused_resource_count >= MAX_UNUSED_RESOURCE) {
                free(r);
@@ -808,6 +956,14 @@ static const char *op_str(int x)
                return "busy";
        case LD_OP_REFRESH_LV:
                return "refresh_lv";
+       case LD_OP_FENCE:
+               return "fence";
+       case LD_OP_FENCE_RESULT:
+               return "fence_result";
+       case LD_OP_SETLOCKARGS_BEFORE:
+               return "setlockargs_before";
+       case LD_OP_SETLOCKARGS_FINAL:
+               return "setlockargs_final";
        default:
                return "op_unknown";
        };
@@ -856,45 +1012,6 @@ int last_string_from_args(char *args_in, char *last)
        return -1;
 }
 
-int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch)
-{
-       char version[MAX_ARGS+1];
-       char *major_str, *minor_str, *patch_str;
-       char *n, *d1, *d2;
-
-       memset(version, 0, sizeof(version));
-       strncpy(version, args, MAX_ARGS);
-       version[MAX_ARGS] = '\0';
-
-       n = strstr(version, ":");
-       if (n)
-               *n = '\0';
-
-       d1 = strstr(version, ".");
-       if (!d1)
-               return -1;
-
-       d2 = strstr(d1 + 1, ".");
-       if (!d2)
-               return -1;
-
-       major_str = version;
-       minor_str = d1 + 1;
-       patch_str = d2 + 1;
-
-       *d1 = '\0';
-       *d2 = '\0';
-
-       if (major)
-               *major = atoi(major_str);
-       if (minor)
-               *minor = atoi(minor_str);
-       if (patch)
-               *patch = atoi(patch_str);
-
-       return 0;
-}
-
 /*
  * Write new info when a command exits if that command has acquired a new LV
  * lock.  If the command has released an LV lock we don't bother updating the
@@ -1915,15 +2032,24 @@ out:
  * closed/terminated their lvmlockd connection, and whose locks should
  * be released.  Do not remove these actions from act_close_list.
  *
+ * act_fence_done: list of OP_FENCE_RESULT actions, identifying hosts that
+ * have been fenced.  LOCK actions waiting for this fencing are moved from
+ * the r->fence_wait_actions list back to the r->actions list for retrying.
+ * Do not remove the FENCE_RESULT actions from act_fence_done list since
+ * these act structs are applied to multiple resources in the lockspace
+ * (like act_close_list.)
+ *
  * retry_out: set to 1 if the lock manager said we should retry,
  * meaning we should call res_process() again in a short while to retry.
  */
 
 static void res_process(struct lockspace *ls, struct resource *r,
-                       struct list_head *act_close_list, int *retry_out)
+                       struct list_head *act_close_list,
+                       struct list_head *act_fence_done,
+                       int *retry_out)
 {
        struct owner owner = { 0 };
-       struct action *act, *safe, *act_close;
+       struct action *act, *safe, *act_close, *act_fence, *act_lock;
        struct lock *lk;
        uint32_t unlock_by_client_id = 0;
        int lm_retry;
@@ -1985,6 +2111,37 @@ static void res_process(struct lockspace *ls, struct resource *r,
                res_cancel(ls, r, act_close);
        }
 
+       if (!list_empty(&r->fence_wait_actions)) {
+               list_for_each_entry(act_fence, act_fence_done, list) {
+                       list_for_each_entry_safe(act_lock, safe, &r->fence_wait_actions, list) {
+                               /*
+                                * act_lock->owner identifies the failed host that owned the
+                                * lock which we submitted a fence request for. if a fence
+                                * result identifies that same owner, then the lock request
+                                * action can continue.
+                                */
+                               if ((act_lock->owner.host_id == act_fence->owner.host_id) &&
+                                   (act_lock->owner.generation == act_fence->owner.generation)) {
+                                       list_del(&act_lock->list);
+                                       if (act_fence->result) {
+                                               /* fencing failed, return locking error to command */
+                                               log_debug("%s:%s lock error after fence error for %u %u",
+                                                         ls->name, r->name, act_fence->owner.host_id, act_fence->owner.generation);
+                                               act_lock->result = -EAGAIN;
+                                               add_client_result(act_lock);
+                                       } else {
+                                               /* fencing done, retry lock request which should no
+                                                  longer be blocked by the failed owner */
+                                               log_debug("%s:%s lock retry after fence success for %u %u",
+                                                         ls->name, r->name, act_fence->owner.host_id, act_fence->owner.generation);
+                                               memset(&act_lock->owner, 0, sizeof(struct owner));
+                                               list_add_tail(&act_lock->list, &r->actions);
+                                       }
+                               }
+                       }
+               }
+       }
+
        /*
         * handle enable/disable
         */
@@ -2215,12 +2372,26 @@ static void res_process(struct lockspace *ls, struct resource *r,
 
                        rv = res_lock(ls, r, act, &lm_retry, &owner);
 
-                       /* TODO: if lock fails because it's owned by a failed host,
-                          and persistent reservations are enabled, then remove the
-                          pr of failed host_id, tell sanlock the host_id is now
-                          dead, and retry lock request. */
+                       /*
+                        * If lock fails because it's owned by a failed host,
+                        * and persistent reservation fencing is enabled, then
+                        * remove the pr of failed host_id, tell sanlock the
+                        * host_id is now dead, and retry lock request.
+                        */
+                       if (ls->fence_pr && (rv == -EAGAIN) &&
+                           owner.host_id && owner.generation &&
+                           !strcmp(owner.state, "FAIL")) {
+                               log_debug("%s:%s res_lock fence_pr %u:%u",
+                                         ls->name, r->name, owner.host_id, owner.generation);
+                               /* after fencing is done for owner, the act's from
+                                  r->fence_wait_actions are moved back to r->actions. */
+                               act->owner = owner;
+                               list_del(&act->list);
+                               list_add(&act->list, &r->fence_wait_actions);
+                               add_fence_action(ls, &owner);
+                               *retry_out = 1;
 
-                       if ((rv == -EAGAIN) &&
+                       } else if ((rv == -EAGAIN) &&
                            (act->retries <= act->max_retries) &&
                            (lm_retry || (r->type != LD_RT_LV))) {
                                /* leave act on list */
@@ -2257,7 +2428,25 @@ static void res_process(struct lockspace *ls, struct resource *r,
 
                        rv = res_lock(ls, r, act, &lm_retry, &owner);
 
-                       if ((rv == -EAGAIN) &&
+                       /*
+                        * If lock fails because it's owned by a failed host,
+                        * and persistent reservation fencing is enabled, then
+                        * remove the pr of failed host_id, tell sanlock the
+                        * host_id is now dead, and retry lock request.
+                        */
+                       if (ls->fence_pr && (rv == -EAGAIN) &&
+                           owner.host_id && owner.generation &&
+                           !strcmp(owner.state, "FAIL")) {
+                               log_debug("%s:%s res_lock fence_pr %u:%u",
+                                         ls->name, r->name, owner.host_id, owner.generation);
+                               /* after fencing is done for owner, the act's from
+                                  r->fence_wait_actions are moved back to r->actions. */
+                               act->owner = owner;
+                               list_del(&act->list);
+                               list_add(&act->list, &r->fence_wait_actions);
+                               add_fence_action(ls, &owner);
+                               *retry_out = 1;
+                       } else if ((rv == -EAGAIN) &&
                            (act->retries <= act->max_retries) &&
                            (lm_retry || (r->type != LD_RT_LV))) {
                                /* leave act on list */
@@ -2291,7 +2480,7 @@ static void res_process(struct lockspace *ls, struct resource *r,
         * processing the OP_CLOSE for the client.
         */
        if ((r->type == LD_RT_LV) && (r->mode == LD_LK_UN) &&
-           list_empty(&r->locks) && list_empty(&r->actions)) {
+           list_empty(&r->locks) && list_empty(&r->actions) && list_empty(&r->fence_wait_actions)) {
 
                /* An implicit unlock of a transient lock. */
                if (!unlock_by_client_id)
@@ -2573,6 +2762,7 @@ static void *lockspace_thread_main(void *arg_in)
        struct action *act_op_free = NULL;
        struct list_head tmp_act;
        struct list_head act_close;
+       struct list_head act_fence;
        char tmp_name[MAX_NAME+5];
        int fail_stop_busy;
        int free_vg = 0;
@@ -2588,6 +2778,7 @@ static void *lockspace_thread_main(void *arg_in)
        int rv;
 
        INIT_LIST_HEAD(&act_close);
+       INIT_LIST_HEAD(&act_fence);
        INIT_LIST_HEAD(&tmp_act);
 
        /* first action may be client add */
@@ -2619,8 +2810,9 @@ static void *lockspace_thread_main(void *arg_in)
                adopt_ok = 1;
        }
 
-       log_debug("S %s lm_add_lockspace %s act %d wait %d adopt_only %d adopt_ok %d repair %d",
-                 ls->name, lm_str(ls->lm_type), add_act ? 1 : 0, wait_flag, adopt_only, adopt_ok, repair);
+       log_debug("S %s lm_add_lockspace %s act %d wait %d adopt_only %d adopt_ok %d repair %d no_timeout %d key 0x%llx",
+                 ls->name, lm_str(ls->lm_type), add_act ? 1 : 0, wait_flag, adopt_only, adopt_ok, repair, ls->no_timeout,
+                 (unsigned long long)ls->ourkey);
 
        /*
         * The prepare step does not wait for anything and is quick;
@@ -2699,6 +2891,10 @@ static void *lockspace_thread_main(void *arg_in)
 
                        act = list_first_entry(&ls->actions, struct action, list);
 
+                       log_debug("S %s ls actions entry: %s", ls->name, op_str(act->op));
+
+                       act->ls_generation = ls->generation;
+
                        if (act->op == LD_OP_KILL_VG && act->rt == LD_RT_VG) {
                                /* Continue processing until DROP_VG arrives. */
                                log_debug("S %s kill_vg", ls->name);
@@ -2731,12 +2927,14 @@ static void *lockspace_thread_main(void *arg_in)
                                ls->thread_work = 0;
                                ls->thread_stop = 1;
                                drop_vg = 1;
+                               /* list_del(&act->list) is done at end of lockspace_thread function */
                                break;
                        }
 
                        if (act->op == LD_OP_STOP) {
-                               /* thread_stop is already set */
                                ls->thread_work = 0;
+                               /* ls->thread_stop = 1 is already set */
+                               /* list_del(&act->list) is done at end of lockspace_thread function */
                                break;
                        }
 
@@ -2762,6 +2960,7 @@ static void *lockspace_thread_main(void *arg_in)
                                ls->thread_work = 0;
                                ls->thread_stop = 1;
                                free_vg = 1;
+                               /* list_del(&act->list) is done at end of lockspace_thread function */
                                break;
                        }
 
@@ -2779,6 +2978,50 @@ static void *lockspace_thread_main(void *arg_in)
                                continue;
                        }
 
+                       if (act->op == LD_OP_SETLOCKARGS_BEFORE && act->rt == LD_RT_VG) {
+                               /* check if sanlock version supports the new args */
+                               if (!lm_setlockargs_supported_sanlock(ls, act)) {
+                                       list_del(&act->list);
+                                       act->result = -EPROTONOSUPPORT;
+                                       add_client_result(act);
+                                       continue;
+                               }
+
+                               /* check that no LV locks are held; a VG lock is usually held */
+                               if (for_each_lock(ls, LOCKS_EXIST_LV)) {
+                                       list_del(&act->list);
+                                       act->result = -ENOTEMPTY;
+                                       add_client_result(act);
+                                       continue;
+                               }
+
+                               /* check that we are the only lockspace user */
+                               rv = lm_hosts(ls, 1);
+                               if (rv) {
+                                       /*
+                                        * rv < 0: error (don't remove)
+                                        * rv > 0: other hosts in lockspace (cannot remove)
+                                        * rv = 0: only local host in lockspace (can remove)
+                                        * Checking for hosts here in addition to after the
+                                        * main loop allows vgremove to fail and be rerun
+                                        * after the ls is stopped on other hosts.
+                                        */
+                                       log_error("S %s setlockargs_before hosts %d", ls->name, rv);
+                                       list_del(&act->list);
+                                       act->result = (rv < 0) ? rv : -EBUSY;
+                                       add_client_result(act);
+                                       continue;
+                               }
+
+                               /* return success, allow the change */
+                               /* list_del act and add_client_result done after rem_lockspace */
+
+                               /* the lockspace needs to be stopped for setlockargs_final */
+                               ls->thread_work = 0;
+                               ls->thread_stop = 1;
+                               break;
+                       }
+
                        if (act->op == LD_OP_RENAME_BEFORE && act->rt == LD_RT_VG) {
                                /* vgrename */
                                log_debug("S %s checking for lockspace hosts", ls->name);
@@ -2792,6 +3035,7 @@ static void *lockspace_thread_main(void *arg_in)
                                }
                                ls->thread_work = 0;
                                ls->thread_stop = 1;
+                               /* list_del(&act->list) is done at end of lockspace_thread function */
                                /* Do we want to check hosts again below like vgremove? */
                                break;
                        }
@@ -2821,6 +3065,7 @@ static void *lockspace_thread_main(void *arg_in)
                        }
 
                        if (act->op == LD_OP_FREE && act->rt == LD_RT_LV) {
+                               /* lvremove */
                                list_del(&act->list);
 
                                r = find_dispose_act(ls, act); /* removes r from dispose list */
@@ -2882,6 +3127,18 @@ static void *lockspace_thread_main(void *arg_in)
                                continue;
                        }
 
+                       /*
+                        * check all resources for lock actions that are waiting
+                        * for this fence result
+                        */
+                       if (act->op == LD_OP_FENCE_RESULT) {
+                               list_del(&act->list);
+                               list_add(&act->list, &act_fence);
+                               log_debug("S %s apply fence result %d for host %u %u",
+                                         ls->name, act->result, act->owner.host_id, act->owner.generation);
+                               continue;
+                       }
+
                        /*
                         * All the other op's are for locking.
                         * Find the specific resource that the lock op is for,
@@ -2905,8 +3162,21 @@ static void *lockspace_thread_main(void *arg_in)
                        log_debug("%s:%s action %s %s", ls->name, r->name,
                                  op_str(act->op), mode_str(act->mode));
                }
+               /* end processing ls->actions */
                pthread_mutex_unlock(&ls->mutex);
 
+               /*
+                * If the fence result was a success, then tell the
+                * sanlock lockspace that the fenced host is dead
+                * so it will grant locks held by the fenced host.
+                */
+               if (ls->lm_type == LD_LM_SANLOCK) {
+                       list_for_each_entry(act, &act_fence, list) {
+                               if (!act->result)
+                                       lm_set_host_dead_sanlock(ls, &act->owner);
+                       }
+               }
+
                /*
                 * Process the lock operations that have been queued for each
                 * resource.
@@ -2915,13 +3185,18 @@ static void *lockspace_thread_main(void *arg_in)
                retry = 0;
 
                list_for_each_entry_safe(r, r2, &ls->resources, list)
-                       res_process(ls, r, &act_close, &retry);
+                       res_process(ls, r, &act_close, &act_fence, &retry);
 
                list_for_each_entry_safe(act, safe, &act_close, list) {
                        list_del(&act->list);
                        free_action(act);
                }
 
+               list_for_each_entry_safe(act, safe, &act_fence, list) {
+                       list_del(&act->list);
+                       free_action(act);
+               }
+
                if (retry) {
                        ls->thread_work = 1;
                        usleep(LOCK_RETRY_MS * 1000);
@@ -3013,12 +3288,20 @@ out_rem:
 
 out_act:
        /*
-        * Move remaining actions to results; this will usually (always?)
-        * be only the stop action.
+        * Move remaining actions to results, this will usually (always?)
+        * be the act processed above which resulted in the lockspace thread
+        * being stopped.  That act is not removed from ls->actions by
+        * the main action processing loop, but remains on ls->actions
+        * and is removed here.  (TODO: wouldn't it be nicer
+        * to always list_del every action above, and save a pointer
+        * to the act struct that caused thread_stop=1?  This seems
+        * to incorrectly return success for any/all acts, not just
+        * the one act that was processed leading to thread_stop.)
         */
        pthread_mutex_lock(&ls->mutex);
        list_for_each_entry_safe(act, safe, &ls->actions, list) {
                if (act->op == LD_OP_FREE) {
+                       /* vgremove */
                        act_op_free = act;
                        act->result = 0;
                } else if (act->op == LD_OP_STOP)
@@ -3027,6 +3310,8 @@ out_act:
                        act->result = 0;
                else if (act->op == LD_OP_RENAME_BEFORE)
                        act->result = 0;
+               else if (act->op == LD_OP_SETLOCKARGS_BEFORE)
+                       act->result = 0;
                else
                        act->result = -ENOLS;
                list_del(&act->list);
@@ -3059,8 +3344,7 @@ out_act:
 
        pthread_mutex_lock(&lockspaces_mutex);
        ls->thread_done = 1;
-       ls->free_vg = free_vg;
-       ls->drop_vg = drop_vg;
+
        if (ls->lm_type == LD_LM_DLM && !strcmp(ls->name, gl_lsname_dlm))
                global_dlm_lockspace_exists = 0;
        if (ls->lm_type == LD_LM_IDM && !strcmp(ls->name, gl_lsname_idm))
@@ -3176,14 +3460,23 @@ static int add_lockspace_thread(const char *ls_name,
        struct resource *r;
        int rv;
 
-       log_debug("add_lockspace_thread %s %s version %u",
-                 lm_str(lm_type), ls_name, act ? act->version : 0);
+       log_debug("add_lockspace_thread %s %s version %u vg_args %s",
+                 lm_str(lm_type), ls_name, act ? act->version : 0, vg_args);
 
        if (!(ls = alloc_lockspace()))
                return -ENOMEM;
 
        strncpy(ls->name, ls_name, MAX_NAME);
        ls->lm_type = lm_type;
+       ls->ourkey = act->ourkey;
+
+       if (lockd_lockargs_get_meta_flags(vg_args, &ls->lock_args_flags) < 0) {
+               log_error("add_lockspace_thread %s lock_args invalid %s", ls->name, vg_args);
+               free(ls);
+               return -EARGS;
+       }
+       ls->no_timeout = (ls->lock_args_flags & LOCKARGS_NOTIMEOUT) ? 1 : 0;
+       ls->fence_pr = (ls->lock_args_flags & LOCKARGS_PERSIST) ? 1 : 0;
 
        if (act) {
                ls->start_client_id = act->client_id;
@@ -3438,6 +3731,9 @@ static int add_lockspace(struct action *act)
  * unlock it when stopping.
  *
  * Should we attempt to stop the lockspace containing the gl last?
+ *
+ * FIXME: why is OP_STOP partly processed here rather than just being
+ * added to ls->actions and processed by the lockspace thread?
  */
 
 static int rem_lockspace(struct action *act)
@@ -3614,6 +3910,10 @@ static int for_each_lockspace(int do_stop, int do_free, int do_force)
                                        list_del(&act->list);
                                        free_action(act);
                                }
+                               list_for_each_entry_safe(act, act2, &ls->fence_history, list) {
+                                       list_del(&act->list);
+                                       free_action(act);
+                               }
                                free_ls_resources(ls);
                                free_pvs_path(&ls->pvs);
                                free(ls);
@@ -3701,7 +4001,8 @@ static int work_init_vg(struct action *act)
        }
 
        if (act->lm_type == LD_LM_SANLOCK)
-               rv = lm_init_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args, act->align_mb);
+               rv = lm_init_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args, act->align_mb,
+                                       act->other_args[0] ? act->other_args : NULL);
        else if (act->lm_type == LD_LM_DLM)
                rv = lm_init_vg_dlm(ls_name, act->vg_name, act->flags, act->vg_args);
        else if (act->lm_type == LD_LM_IDM)
@@ -3732,6 +4033,36 @@ static int work_rename_vg(struct action *act)
        return rv;
 }
 
+/*
+ * Complete a setlockargs operation for a sanlock VG.
+ *
+ * The lockspace was stopped in setlockargs_before, but the lockspace
+ * thread may not have been fully cleaned up yet.  There is no waiting
+ * here: if the lockspace still exists, return -EAGAIN so the request
+ * is retried later.  (The previous while(1) form returned on its
+ * first pass, so a single check is equivalent.)
+ *
+ * Returns -EINVAL for lock manager types other than sanlock.
+ */
+static int work_setlockargs_vg_final(struct action *act)
+{
+       char ls_name[MAX_NAME+1] = {0};
+       int found;
+       int rv = -EINVAL;
+
+       if (act->lm_type == LD_LM_SANLOCK) {
+               vg_ls_name(act->vg_name, ls_name);
+
+               pthread_mutex_lock(&lockspaces_mutex);
+               found = find_lockspace_name(ls_name) ? 1 : 0;
+               pthread_mutex_unlock(&lockspaces_mutex);
+
+               if (found) {
+                       log_debug("S %s work_setlockargs_vg_final ls not cleared, retry", ls_name);
+                       return -EAGAIN;
+               }
+
+               rv = lm_setlockargs_vg_sanlock(ls_name, act->vg_name, act);
+       }
+
+       return rv;
+}
+
 static void work_test_gl(void)
 {
        struct lockspace *ls;
@@ -3798,7 +4129,7 @@ static int work_init_lv(struct action *act)
        if (lm_type == LD_LM_SANLOCK) {
                /* ls is NULL if the lockspace is not started, which happens
                   for vgchange --locktype sanlock. */
-               rv = lm_init_lv_sanlock(ls, ls_name, act->vg_name, act->lv_uuid, vg_args, lv_args, act->prev_lv_args);
+               rv = lm_init_lv_sanlock(ls, ls_name, act->vg_name, act->lv_uuid, vg_args, lv_args, act->other_args);
                memcpy(act->lv_args, lv_args, MAX_ARGS);
                return rv;
 
@@ -3835,6 +4166,116 @@ static int work_vg_status(struct action *act)
        return rv;
 }
 
+/*
+ * Worker thread: process an OP_FENCE action queued by a lockspace
+ * thread (add_fence_action) after a lock request failed because the
+ * lock owner is a failed host.
+ *
+ * The new fencing act is compared against the lockspace fence_history:
+ * . matches a completed entry (OP_FENCE_RESULT): reuse this act to
+ *   deliver the previous result to the lockspace thread.
+ * . matches an in-progress entry (OP_FENCE): set *retry so the caller
+ *   requeues this act on the delayed list.
+ * . no match: keep this act on fence_history and send a fencing
+ *   request to the helper process.
+ *
+ * NOTE(review): lockspaces_mutex is held across send_helper_request(),
+ * which may sleep(1) when the helper pipe is full.
+ */
+static void work_fence(struct action *act, int *retry)
+{
+       char ls_name[MAX_NAME+1];
+       char vg_name[MAX_NAME+1];
+       struct lockspace *ls;
+       struct action *ah;
+       struct owner ah_owner = { 0 };
+       uint32_t new_msg_id = 0;
+       int ah_result = 0;
+       int found_busy = 0;
+       int found_done = 0;
+       int rv;
+
+       memset(ls_name, 0, sizeof(ls_name));
+       memcpy(vg_name, act->vg_name, sizeof(act->vg_name));
+
+       pthread_mutex_lock(&lockspaces_mutex);
+       vg_ls_name(vg_name, ls_name);
+       ls = find_lockspace_name(ls_name);
+       if (!ls) {
+               pthread_mutex_unlock(&lockspaces_mutex);
+               log_error("no lockspace for fence action %s.", ls_name);
+               return;
+       }
+
+       pthread_mutex_lock(&ls->mutex);
+       list_for_each_entry(ah, &ls->fence_history, list) {
+               if (ah->owner.host_id != act->owner.host_id)
+                       continue;
+               if (ah->owner.generation != act->owner.generation)
+                       continue;
+
+               if (ah->op == LD_OP_FENCE) {
+                       /* new act matches an in-progress fence act */
+                       found_busy = 1;
+               } else if (ah->op == LD_OP_FENCE_RESULT) {
+                       /* new act matches a completed fence act */
+                       found_done = 1;
+                       ah_result = ah->result;
+                       ah_owner = ah->owner;
+               }
+               break;
+       }
+
+       if (!found_done && !found_busy) {
+               /*
+                * send the helper a fencing request for this act.
+                * keep this new act in fence_history while the helper
+                * is working on it. when it's completed, this act will
+                * be changed from OP_FENCE to OP_FENCE_RESULT and kept
+                * in fence_history.
+                *
+                * helper_msg_id is only touched by the single worker
+                * thread, so no extra locking is needed for it.
+                */
+               list_add(&act->list, &ls->fence_history);
+               new_msg_id = helper_msg_id++;
+
+               log_debug("work_fence %s found_done %d found_busy %d send helper new_msg_id %u", vg_name, found_done, found_busy, new_msg_id);
+
+       } else if (found_done) {
+               /*
+                * A matching OP_FENCE was already completed.
+                * Reuse this act as an OP_FENCE_RESULT.
+                */
+               act->op = LD_OP_FENCE_RESULT;
+               act->result = ah_result;
+               act->owner = ah_owner;
+
+               if (!ls->thread_stop) {
+                       list_add_tail(&act->list, &ls->actions);
+                       ls->thread_work = 1;
+                       pthread_cond_signal(&ls->cond);
+               } else {
+                       free_action(act);
+               }
+
+               log_debug("work_fence %s found_done %d found_busy %d fence result %d", vg_name, found_done, found_busy, ah_result);
+
+       } else if (found_busy) {
+               /* when retried, the result will eventually be found in history above */
+               *retry = 1;
+
+               log_debug("work_fence %s found_done %d found_busy %d retry", vg_name, found_done, found_busy);
+       }
+       pthread_mutex_unlock(&ls->mutex);
+
+       if (!found_done && !found_busy) {
+               rv = send_helper_request(act, ls_name, new_msg_id);
+               if (rv < 0) {
+                       /*
+                        * change act to FENCE_RESULT error and move it to
+                        * ls->actions; set thread_work and signal the cond
+                        * so the lockspace thread actually processes the
+                        * error (and free the act if the thread is stopping.)
+                        */
+                       log_error("work_fence %s failed to send helper request %u", vg_name, new_msg_id);
+                       pthread_mutex_lock(&ls->mutex);
+                       list_del(&act->list);
+                       act->op = LD_OP_FENCE_RESULT;
+                       act->result = -ENOTCONN;
+                       if (!ls->thread_stop) {
+                               list_add_tail(&act->list, &ls->actions);
+                               ls->thread_work = 1;
+                               pthread_cond_signal(&ls->cond);
+                       } else {
+                               free_action(act);
+                       }
+                       pthread_mutex_unlock(&ls->mutex);
+               }
+       }
+       pthread_mutex_unlock(&lockspaces_mutex);
+}
+
 /*
  * When an action is queued for the worker_thread, it is processed right away.
  * After processing, some actions need to be retried again in a short while.
@@ -3947,6 +4388,11 @@ static void *worker_thread_main(void *arg_in)
                        act->result = work_rename_vg(act);
                        add_client_result(act);
 
+               } else if ((act->op == LD_OP_SETLOCKARGS_FINAL) && (act->rt == LD_RT_VG)) {
+                       log_debug("work setlockargs_vg_final %s", act->vg_name);
+                       act->result = work_setlockargs_vg_final(act);
+                       add_client_result(act);
+
                } else if (act->op == LD_OP_START_WAIT) {
                        act->result = count_lockspace_starting(0);
                        if (!act->result)
@@ -3974,6 +4420,12 @@ static void *worker_thread_main(void *arg_in)
                        } else
                                list_add(&act->list, &delayed_list);
 
+               } else if (act->op == LD_OP_FENCE) {
+                       int retry = 0;
+                       log_debug("work_fence %s %u %u", act->vg_name, act->owner.host_id, act->owner.generation);
+                       work_fence(act, &retry);
+                       if (retry)
+                               list_add(&act->list, &delayed_list);
                } else {
                        log_error("work unknown op %d", act->op);
                        act->result = -EINVAL;
@@ -4235,10 +4687,9 @@ static int client_send_result(struct client *cl, struct action *act)
        if (act->flags & LD_AF_SH_EXISTS)
                strcat(result_flags, "SH_EXISTS,");
 
-       if (act->op == LD_OP_INIT) {
+       if (act->op == LD_OP_INIT || act->op == LD_OP_SETLOCKARGS_FINAL) {
                /*
-                * init is a special case where lock args need
-                * to be passed back to the client.
+                * init and setlockargs send lock_args back to the client.
                 */
                const char *vg_args = "none";
                const char *lv_args = "none";
@@ -4386,6 +4837,7 @@ static int client_send_result(struct client *cl, struct action *act)
                                          "op_result = " FMTd64, (int64_t) act->result,
                                          "lm_result = " FMTd64, (int64_t) act->lm_rv,
                                          "result_flags = %s", result_flags[0] ? result_flags : "none",
+                                         "ls_generation = " FMTd64, (int64_t) act->ls_generation,
                                          NULL);
        }
 
@@ -4622,6 +5074,7 @@ static int str_to_op_rt(const char *req_name, int *op, int *rt)
                return 0;
        }
        if (!strcmp(req_name, "free_vg")) {
+               /* TODO: use LD_OP_REMOVE_VG */
                *op = LD_OP_FREE;
                *rt = LD_RT_VG;
                return 0;
@@ -4632,6 +5085,7 @@ static int str_to_op_rt(const char *req_name, int *op, int *rt)
                return 0;
        }
        if (!strcmp(req_name, "free_lv")) {
+               /* TODO: use LD_OP_REMOVE_LV */
                *op = LD_OP_FREE;
                *rt = LD_RT_LV;
                return 0;
@@ -4736,6 +5190,16 @@ static int str_to_op_rt(const char *req_name, int *op, int *rt)
                *rt = 0;
                return 0;
        }
+       if (!strcmp(req_name, "setlockargs_vg_before")) {
+               *op = LD_OP_SETLOCKARGS_BEFORE;
+               *rt = LD_RT_VG;
+               return 0;
+       }
+       if (!strcmp(req_name, "setlockargs_vg_final")) {
+               *op = LD_OP_SETLOCKARGS_FINAL;
+               *rt = LD_RT_VG;
+               return 0;
+       }
 out:
        return -1;
 }
@@ -4913,13 +5377,15 @@ static int print_lockspace(struct lockspace *ls, const char *prefix, int pos, in
                        "vg_args=%s "
                        "lm_type=%s "
                        "host_id=%u "
+                       "generation=%llu "
                        "create_fail=%d "
                        "create_done=%d "
                        "thread_work=%d "
                        "thread_stop=%d "
                        "thread_done=%d "
                        "kill_vg=%d "
-                       "drop_vg=%d "
+                       "fence_pr=%d "
+                       "no_timeout=%d "
                        "sanlock_gl_enabled=%d\n",
                        prefix,
                        ls->name,
@@ -4928,13 +5394,15 @@ static int print_lockspace(struct lockspace *ls, const char *prefix, int pos, in
                        ls->vg_args,
                        lm_str(ls->lm_type),
                        ls->host_id,
+                       (unsigned long long)ls->generation,
                        ls->create_fail ? 1 : 0,
                        ls->create_done ? 1 : 0,
                        ls->thread_work ? 1 : 0,
                        ls->thread_stop ? 1 : 0,
                        ls->thread_done ? 1 : 0,
                        ls->kill_vg,
-                       ls->drop_vg,
+                       ls->fence_pr,
+                       ls->no_timeout,
                        ls->sanlock_gl_enabled ? 1 : 0);
 }
 
@@ -5117,6 +5585,7 @@ static void client_recv_action(struct client *cl)
        char buf[18];   /* "path[%d]\0", %d outputs signed integer so max to 10 bytes */
        int64_t val;
        uint32_t opts = 0;
+       uint64_t ourkey;
        int result = 0;
        int cl_pid;
        int op, rt, lm, mode;
@@ -5265,7 +5734,11 @@ static void client_recv_action(struct client *cl)
 
        str = daemon_request_str(req, "prev_lv_args", NULL);
        if (str && strcmp(str, "none"))
-               strncpy(act->prev_lv_args, str, MAX_ARGS);
+               strncpy(act->other_args, str, MAX_ARGS);
+
+       str = daemon_request_str(req, "set_lock_args", NULL);
+       if (str && strcmp(str, "none"))
+               strncpy(act->other_args, str, MAX_ARGS);
 
        /* start_vg will include lvmlocal.conf local/host_id here */
        val = daemon_request_int(req, "host_id", 0);
@@ -5278,6 +5751,10 @@ static void client_recv_action(struct client *cl)
 
        act->lv_size_bytes = (uint64_t)dm_config_find_int64(req.cft->root, "lv_size_bytes", 0);
 
+       ourkey = (uint64_t)dm_config_find_int64(req.cft->root, "our_key", 0);
+       if (ourkey)
+               act->ourkey = ourkey;
+
        /* Create PV list for idm */
        if (lm == LD_LM_IDM) {
                memset(&pvs, 0x0, sizeof(pvs));
@@ -5369,6 +5846,7 @@ skip_pvs_path:
        case LD_OP_RENAME_FINAL:
        case LD_OP_RUNNING_LM:
        case LD_OP_REFRESH_LV:
+       case LD_OP_SETLOCKARGS_FINAL:
                add_work_action(act);
                rv = 0;
                break;
@@ -5383,6 +5861,7 @@ skip_pvs_path:
        case LD_OP_KILL_VG:
        case LD_OP_DROP_VG:
        case LD_OP_BUSY:
+       case LD_OP_SETLOCKARGS_BEFORE:
                rv = add_lock_action(act);
                break;
        default:
@@ -6435,6 +6914,343 @@ static void process_restart(int fd)
                log_debug("process_restart error %d", errno);
 }
 
+/*
+ * Fencing
+ *
+ * lockspace thread
+ * . res_process() lock action fails due to a failed host
+ * . add_fence_action() creates new action OP_FENCE with owner info
+ * . adds it to work actions
+ *
+ * worker thread
+ * . takes new OP_FENCE
+ * . compares it against lockspace's fence_history list
+ *   (completed fence actions for hosts)
+ * . if action for same host is complete, add OP_FENCE_RESULT to
+ *   actions for the lockspace thread
+ * . if action for same host is in progress, return and have worker
+ *   thread retry after delay
+ * . else send new fence command to helper process
+ *
+ * helper process
+ * . receives fencing command
+ * . runs fencing command:
+ *   lvmpersist remove --ourkey OURKEY --removekey REMKEY --vg VG
+ * . sends result back to main thread
+ *
+ * main thread
+ * . receive fencing result from helper process, process_helper
+ * . process_fence_result() finds original OP_FENCE act in
+ *   ls fence_history and changes it to OP_FENCE_RESULT
+ * . adds a new OP_FENCE_RESULT action to the lockspace actions list
+ *
+ * lockspace thread
+ * . applies OP_FENCE_RESULT to each resource's fence_wait_actions
+ * . moves matching fence_wait_actions entries to r->actions
+ *   to be retried
+ */
+
+/*
+ * We cannot block the main thread on this write, so the pipe is
+ * NONBLOCK, and write fails with EAGAIN when the pipe is full.
+ * With 1k msg size and 64k default pipe size, the pipe will be full
+ * if we quickly send 64 messages.
+ *
+ * By setting the pipe size to 1MB in setup_helper, we could quickly send 1024
+ * msgs before getting EAGAIN.
+ */
+
+/*
+ * Send one fencing command to the helper process over the nonblocking
+ * pipe.  Retries forever on EINTR; on EAGAIN (pipe full) sleeps one
+ * second and retries exactly once.  Returns 0 on success, -1 on any
+ * failure (no fd, non-fence op, or the write did not complete).
+ */
+static int send_helper_request(struct action *act, char *ls_name, uint32_t new_msg_id)
+{
+       struct helper_msg msg = { 0 };
+       int slept_once = 0;
+       int n;
+
+       if (helper_send_fd == -1) {
+               log_error("send_helper_request no send fd");
+               return -1;
+       }
+
+       if (act->op != LD_OP_FENCE)
+               return -1;
+
+       strncpy(msg.ls_name, ls_name, MAX_NAME);
+       msg.type = HELPER_COMMAND;
+       msg.act = LD_OP_FENCE;
+       msg.msg_id = new_msg_id;
+       act->msg_id = new_msg_id;
+       snprintf(msg.command, RUN_COMMAND_LEN-1, "/usr/sbin/lvmpersist remove --ourkey 0x%llx --removekey 0x%llx --vg %s",
+                (unsigned long long)act->ourkey,
+                (unsigned long long)act->remkey,
+                act->vg_name);
+       log_debug("send_helper_request fence msg %u %s", new_msg_id, msg.command);
+
+       for (;;) {
+               n = write(helper_send_fd, &msg, sizeof(msg));
+
+               if (n == sizeof(msg))
+                       return 0;
+
+               if (n == -1 && errno == EINTR)
+                       continue;
+
+               if (n == -1 && errno == EAGAIN) {
+                       /* pipe is full */
+                       if (!slept_once) {
+                               slept_once = 1;
+                               sleep(1);
+                               continue;
+                       }
+                       log_error("send_helper_request write EAGAIN");
+                       return -1;
+               }
+
+               /* helper exited or closed fd */
+               if (n == -1 && errno == EPIPE) {
+                       log_error("send_helper_request write EPIPE");
+                       return -1;
+               }
+
+               /* this shouldn't happen */
+               log_error("send_helper_request write error %d %d", n, errno);
+               return -1;
+       }
+}
+
+/* lockspace threads call add_fence_action() */
+
+static int add_fence_action(struct lockspace *ls, struct owner *owner)
+{
+       struct action *act;
+
+       if (!(act = alloc_action()))
+               return -1;
+
+       /*
+        * The creation of a key here for host_id X generation Y must match the
+        * logic that lvm commands use to generate keys for sanlock hosts:
+        *
+        * key 0x100000YYYYYYXXXX where XXXX are the hex digits for the host_id,
+        * and YYYYYY are the hex digits for the generation number.
+        *
+        * The generation field must be widened to 64 bits before shifting:
+        * YYYYYY spans key bits 16..39, and a 32-bit shift would truncate
+        * generation bits 16..23 for generations above 0xFFFF.
+        */
+
+       memcpy(act->vg_name, ls->vg_name, sizeof(act->vg_name));
+       memcpy(act->vg_uuid, ls->vg_uuid, sizeof(act->vg_uuid));
+       act->op = LD_OP_FENCE;
+       act->ourkey = ls->ourkey;
+       act->remkey = 0x1000000000000000ULL |
+                     ((uint64_t)(owner->generation & 0xFFFFFF) << 16) |
+                     (uint64_t)(owner->host_id & 0xFFFF);
+       memcpy(&act->owner, owner, sizeof(struct owner));
+
+       log_debug("add_fence_action vg %s for host_id %u gen %u ourkey 0x%llx remkey 0x%llx",
+                 act->vg_name, act->owner.host_id, act->owner.generation,
+                 (unsigned long long)act->ourkey, (unsigned long long)act->remkey);
+
+       add_work_action(act);
+       return 0;
+}
+
+/*
+ * Create the helper process used to run fencing commands, connected
+ * to the daemon by a pair of pipes.  All pipe ends are O_NONBLOCK
+ * (the main thread must never block) and O_CLOEXEC.  On success the
+ * parent keeps helper_send_fd/helper_recv_fd and records helper_pid;
+ * the child runs helper_main() and never returns.
+ * Returns 0 on success or -errno.
+ */
+static int setup_helper(void)
+{
+       int pfd[2];
+       int parent_write = -1; /* daemon -> helper, write end */
+       int child_read = -1;   /* daemon -> helper, read end */
+       int parent_read = -1;  /* helper -> daemon, read end */
+       int child_write = -1;  /* helper -> daemon, write end */
+       int pid;
+
+       if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC))
+               return -errno;
+
+       child_read = pfd[0];
+       parent_write = pfd[1];
+
+       /* a larger pipe would allow more queued requests, e.g.
+          fcntl(parent_write, F_SETPIPE_SZ, 1024*1024); */
+
+       if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC)) {
+               close(child_read);
+               close(parent_write);
+               return -errno;
+       }
+
+       parent_read = pfd[0];
+       child_write = pfd[1];
+
+       pid = fork();
+       if (pid < 0) {
+               close(child_read);
+               close(parent_write);
+               close(parent_read);
+               close(child_write);
+               return -errno;
+       }
+
+       if (!pid) {
+               /* child: close the parent's ends and run the helper loop */
+               close(parent_read);
+               close(parent_write);
+               helper_main(child_read, child_write, daemon_debug);
+               exit(0);
+       }
+
+       /* parent: close the child's ends and keep ours */
+       close(child_read);
+       close(child_write);
+       helper_send_fd = parent_write;
+       helper_recv_fd = parent_read;
+       helper_pid = pid;
+       return 0;
+}
+
+/*
+ * Tear down the daemon's side of the helper connection: close both
+ * pipe fds and remove the receive fd from the main poll loop.
+ * helper_pid is intentionally left set so helper_dead() can still
+ * waitpid() the process afterwards.
+ */
+static void close_helper(void)
+{
+       close(helper_send_fd);
+       close(helper_recv_fd);
+       helper_send_fd = -1;
+       helper_recv_fd = -1;
+       rem_pollfd(helper_pi);
+       helper_pi = -1;
+       /* don't set helper_pid = -1 until we've tried waitpid */
+}
+
+/*
+ * Called from the main poll loop when the helper's pipe reports
+ * death: close the connection, reap the helper process, and log
+ * how it terminated.
+ */
+static void helper_dead(int fd)
+{
+       int pid = helper_pid;
+       int status;
+       int rv;
+
+       close_helper();
+       helper_pid = -1;
+
+       rv = waitpid(pid, &status, WNOHANG);
+       if (rv != pid) {
+               /* should not happen */
+               log_error("helper pid %d dead wait %d", pid, rv);
+       } else if (WIFEXITED(status)) {
+               log_error("helper pid %d exit status %d", pid,
+                         WEXITSTATUS(status));
+       } else if (WIFSIGNALED(status)) {
+               log_error("helper pid %d term signal %d", pid,
+                         WTERMSIG(status));
+       } else {
+               /* should not happen */
+               log_error("helper pid %d state change", pid);
+       }
+}
+
+/*
+ * main thread runs process_helper() and process_fence_result()
+ * the result is given to each lockspace as an action to process.
+ */
+
+/*
+ * Main thread: handle a fencing result received from the helper.
+ * Finds the originating OP_FENCE act in the lockspace fence_history
+ * (matched by msg_id), converts it to a completed OP_FENCE_RESULT
+ * entry, and queues a new OP_FENCE_RESULT action for the lockspace
+ * thread so waiting lock requests can be retried (or failed, if the
+ * fencing result is an error).
+ */
+static void process_fence_result(struct helper_msg *msg)
+{
+       struct lockspace *ls;
+       struct action *ah, *act;
+       int found = 0;
+
+       log_debug("process_fence_result %s msg_id %u result %d", msg->ls_name, msg->msg_id, msg->result);
+
+       /* create a fence result act to pass the result from ah */
+       if (!(act = alloc_action()))
+               return;
+
+       /*
+        * find the OP_FENCE action that initiated the fence request,
+        * it was saved on the fence_history list.
+        */
+       pthread_mutex_lock(&lockspaces_mutex);
+       ls = find_lockspace_name(msg->ls_name);
+       if (!ls) {
+               pthread_mutex_unlock(&lockspaces_mutex);
+               log_error("No lockspace for fence result %s", msg->ls_name);
+               free_action(act);
+               return;
+       }
+
+       pthread_mutex_lock(&ls->mutex);
+       list_for_each_entry(ah, &ls->fence_history, list) {
+               if (ah->msg_id != msg->msg_id)
+                       continue;
+
+               if (ah->op != LD_OP_FENCE) {
+                       /* shouldn't happen */
+                       log_error("process_fence_result wrong history op for msg_id %u", ah->msg_id);
+               }
+
+               /*
+                * change the OP_FENCE action into an OP_FENCE_RESULT action
+                * that is saved in the fence_history.
+                *
+                * TODO: limit history, one per host_id?
+                * e.g. remove older gen results?
+                */
+               ah->op = LD_OP_FENCE_RESULT;
+               ah->result = msg->result;
+
+               /* if the result is failure, then the lock requests
+                  waiting on this fence result will return an error */
+
+               found = 1;
+               break;
+       }
+
+       if (!found) {
+               log_error("fence result does not match a fence request");
+               /* free the unused act; leaking it here was a bug */
+               free_action(act);
+               goto out;
+       }
+
+       act->op = LD_OP_FENCE_RESULT;
+       act->owner = ah->owner;
+       act->result = ah->result;
+
+       if (!ls->thread_stop) {
+               list_add_tail(&act->list, &ls->actions);
+               ls->thread_work = 1;
+               pthread_cond_signal(&ls->cond);
+       } else {
+               free_action(act);
+       }
+out:
+       pthread_mutex_unlock(&ls->mutex);
+       pthread_mutex_unlock(&lockspaces_mutex);
+}
+
+/*
+ * Main thread: read one result message from the helper pipe and
+ * dispatch it.  The pipe is nonblocking: a spurious wakeup returns
+ * silently rather than tearing down the helper connection.
+ */
+static void process_helper(int fd)
+{
+       struct helper_msg msg;
+       int rv;
+
+       memset(&msg, 0, sizeof(msg));
+
+       rv = read(fd, &msg, sizeof(msg));
+       if (!rv)
+               return;
+       if (rv < 0) {
+               /*
+                * read() returns -1 and sets errno; the old check
+                * (rv == -EAGAIN) never matched, so a harmless EAGAIN
+                * would incorrectly close the helper connection.
+                */
+               if (errno == EAGAIN || errno == EINTR)
+                       return;
+               log_error("process_helper rv %d errno %d", rv, errno);
+               goto fail;
+       }
+       if (rv != sizeof(msg)) {
+               log_error("process_helper recv size %d", rv);
+               goto fail;
+       }
+
+       if ((msg.type == HELPER_COMMAND_RESULT) && (msg.act == LD_OP_FENCE))
+               process_fence_result(&msg);
+       else
+               log_error("process_helper unknown msg %u %u %u", msg.type, msg.act, msg.msg_id);
+       return;
+
+ fail:
+       close_helper();
+}
+
 static void sigterm_handler(int sig __attribute__((unused)))
 {
        daemon_quit = 1;
@@ -6445,11 +7261,18 @@ static int main_loop(daemon_state *ds_arg)
        struct client *cl;
        int i, rv, is_recv, is_dead;
 
+       rv = setup_helper();
+       if (rv < 0) {
+               log_error("Can't setup helper process");
+               return rv;
+       }
+
        signal(SIGTERM, &sigterm_handler);
 
        rv = setup_structs();
        if (rv < 0) {
                log_error("Can't allocate memory");
+               close_helper();
                return rv;
        }
 
@@ -6467,6 +7290,8 @@ static int main_loop(daemon_state *ds_arg)
        listen_fd = ds_arg->socket_fd;
        listen_pi = add_pollfd(listen_fd);
 
+       helper_pi = add_pollfd(helper_recv_fd);
+
        setup_client_thread();
        setup_worker_thread();
        setup_restart();
@@ -6527,6 +7352,14 @@ static int main_loop(daemon_state *ds_arg)
                                continue;
                        }
 
+                       if (i == helper_pi) {
+                               if (is_recv)
+                                       process_helper(pollfd[i].fd);
+                               if (is_dead)
+                                       helper_dead(pollfd[i].fd);
+                               continue;
+                       }
+
                        /*
                        log_debug("poll pi %d fd %d revents %x",
                                  i, pollfd[i].fd, pollfd[i].revents);
index 9b94e17d16a85654ecf5b28f3a76776ed7e83b5f..cb4d7b2790a22dad5f570d4f1148e3bd486f4c2c 100644 (file)
@@ -76,7 +76,7 @@ static int check_args_version(char *vg_args)
        unsigned int major = 0;
        int rv;
 
-       rv = version_from_args(vg_args, &major, NULL, NULL);
+       rv = lockd_lockargs_get_version(vg_args, &major, NULL, NULL);
        if (rv < 0) {
                log_error("check_args_version %s error %d", vg_args, rv);
                return rv;
diff --git a/daemons/lvmlockd/lvmlockd-helper.c b/daemons/lvmlockd/lvmlockd-helper.c
new file mode 100644 (file)
index 0000000..30542d4
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2025 Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v2 or (at your option) any later version.
+ */
+
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <poll.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <time.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <ctype.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <grp.h>
+#include <syslog.h>
+
+#include "lvmlockd-internal.h"
+
+struct list_head commands; /* helper_msg_list entries */
+
+static int _log_stderr;
+
+#define log_helper(fmt, args...) \
+do { \
+       if (_log_stderr) \
+               fprintf(stderr, fmt "\n", ##args); \
+} while (0)
+
+/*
+ * Remember an in-flight command so its result can be matched to the
+ * child pid when it exits.  On allocation failure the command is
+ * dropped and the daemon never receives a result for this msg_id,
+ * so at least log it instead of failing silently.
+ */
+static void _save_command(struct helper_msg *msg)
+{
+       struct helper_msg_list *ml;
+
+       ml = malloc(sizeof(struct helper_msg_list));
+       if (!ml) {
+               log_helper("save_command no mem for msg %u", msg->msg_id);
+               return;
+       }
+
+       memcpy(&ml->msg, msg, sizeof(struct helper_msg));
+       list_add_tail(&ml->list, &commands);
+}
+
+/* Find the saved command entry for a given child pid, or NULL. */
+static struct helper_msg_list *_get_command(int pid)
+{
+       struct helper_msg_list *entry;
+
+       list_for_each_entry(entry, &commands, list)
+               if (entry->msg.pid == pid)
+                       return entry;
+
+       return NULL;
+}
+
+/*
+ * Read one fixed-size helper_msg from fd, retrying on EINTR.
+ * Returns 0 on a complete read, -1 on any short read or error.
+ */
+static int read_msg(int fd, struct helper_msg *msg)
+{
+       int got;
+
+       do {
+               got = read(fd, msg, sizeof(struct helper_msg));
+       } while (got == -1 && errno == EINTR);
+
+       return (got == sizeof(struct helper_msg)) ? 0 : -1;
+}
+
+/*
+ * Split cmd_str on whitespace into an argv and exec it in the child.
+ * A backslash escapes a following backslash or whitespace character;
+ * any other escaped or non-printable character stops parsing.
+ * Returns (to a caller that exits the child) only on error.
+ */
+static void exec_command(char *cmd_str)
+{
+       char arg[ONE_ARG_LEN];
+       char *av[MAX_AV_COUNT + 1]; /* +1 for NULL */
+       int av_count = 0;
+       int i, arg_len, cmd_len;
+
+       for (i = 0; i < MAX_AV_COUNT + 1; i++)
+               av[i] = NULL;
+
+       if (!cmd_str[0])
+               return;
+
+       /* this should already be done, but make sure */
+       cmd_str[RUN_COMMAND_LEN - 1] = '\0';
+
+       memset(&arg, 0, sizeof(arg));
+       arg_len = 0;
+       cmd_len = strlen(cmd_str);
+
+       for (i = 0; i < cmd_len; i++) {
+               if (!cmd_str[i])
+                       break;
+
+               if (av_count == MAX_AV_COUNT)
+                       break;
+
+               /* at most one char is appended per iteration; stop before
+                  overflowing arg (room must remain for the NUL) */
+               if (arg_len == ONE_ARG_LEN - 1)
+                       break;
+
+               if (cmd_str[i] == '\\') {
+                       if (i == (cmd_len - 1))
+                               break;
+                       i++;
+
+                       if (cmd_str[i] == '\\') {
+                               arg[arg_len++] = cmd_str[i];
+                               continue;
+                       }
+                       if (isspace(cmd_str[i])) {
+                               arg[arg_len++] = cmd_str[i];
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+
+               if (isalnum(cmd_str[i]) || ispunct(cmd_str[i])) {
+                       arg[arg_len++] = cmd_str[i];
+               } else if (isspace(cmd_str[i])) {
+                       if (arg_len)
+                               av[av_count++] = strdup(arg);
+
+                       memset(arg, 0, sizeof(arg));
+                       arg_len = 0;
+               } else {
+                       break;
+               }
+       }
+
+       if ((av_count < MAX_AV_COUNT) && arg_len)
+               av[av_count++] = strdup(arg);
+
+       /* nothing to exec (empty command or strdup failure on arg 0) */
+       if (!av[0])
+               return;
+
+       execvp(av[0], av);
+}
+
+/*
+ * Write one helper_msg result back to the daemon, retrying on EINTR.
+ * Returns 0 on a complete write, -1 otherwise.
+ *
+ * NOTE(review): the out pipe is O_NONBLOCK; on EAGAIN the result is
+ * dropped and the daemon relies on retrying the fence request — TODO
+ * confirm whether queuing and resending is needed here.
+ */
+static int send_result(struct helper_msg *msg, int fd)
+{
+       int rv;
+ retry:
+       rv = write(fd, msg, sizeof(struct helper_msg));
+       if (rv == -1 && errno == EINTR)
+               goto retry;
+
+       if (rv == sizeof(struct helper_msg))
+               return 0;
+       return -1;
+}
+
+#define IDLE_TIMEOUT_MS (30 * 1000)
+#define ACTIVE_TIMEOUT_MS 500
+
+/*
+ * Main loop of the helper process: receive HELPER_COMMAND messages
+ * from the daemon on in_fd, fork/exec each command, and send a
+ * HELPER_COMMAND_RESULT back on out_fd when the child exits.
+ * The poll timeout is short while children are outstanding (so their
+ * exits are collected promptly) and long when idle.  Exits when the
+ * daemon closes its end of the pipe.
+ */
+__attribute__((noreturn)) void helper_main(int in_fd, int out_fd, int log_stderr)
+{
+       struct pollfd pollfd;
+       struct helper_msg msg;
+       struct helper_msg_list *ml;
+       siginfo_t info;
+       unsigned int fork_count = 0;
+       unsigned int done_count = 0;
+       int timeout = IDLE_TIMEOUT_MS;
+       int rv, pid;
+
+       INIT_LIST_HEAD(&commands);
+
+       _log_stderr = log_stderr;
+
+       /* drop supplementary groups before running commands */
+       rv = setgroups(0, NULL);
+       if (rv < 0)
+               log_helper("error clearing helper groups errno %i", errno);
+
+       memset(&pollfd, 0, sizeof(pollfd));
+       pollfd.fd = in_fd;
+       pollfd.events = POLLIN;
+
+       openlog("lvmlockd-helper", LOG_CONS | LOG_PID, LOG_LOCAL4);
+
+       while (1) {
+               rv = poll(&pollfd, 1, timeout);
+               if (rv == -1 && errno == EINTR)
+                       continue;
+
+               if (rv < 0)
+                       exit(0);
+
+               if (pollfd.revents & POLLIN) {
+                       memset(&msg, 0, sizeof(msg));
+
+                       rv = read_msg(in_fd, &msg);
+                       if (rv)
+                               continue;
+
+                       if (msg.type == HELPER_COMMAND) {
+                               pid = fork();
+
+                               if (pid < 0) {
+                                       /* fork failed: send an error result now,
+                                          otherwise the daemon would wait forever
+                                          for this msg_id */
+                                       log_helper("helper fork error %d", errno);
+                                       msg.type = HELPER_COMMAND_RESULT;
+                                       msg.result = -1;
+                                       send_result(&msg, out_fd);
+                               } else if (!pid) {
+                                       exec_command(msg.command);
+                                       exit(1);
+                               } else {
+                                       msg.pid = pid;
+                                       _save_command(&msg);
+                                       fork_count++;
+                               }
+                       }
+               }
+
+               if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL))
+                       exit(0);
+
+               /* collect child exits until no more children exist (ECHILD)
+                  or none are ready (WNOHANG) */
+
+               while (1) {
+                       memset(&info, 0, sizeof(info));
+
+                       rv = waitid(P_ALL, 0, &info, WEXITED | WNOHANG);
+
+                       if ((rv < 0) && (errno == ECHILD)) {
+                               /* no children left; poll can sleep longer */
+                               timeout = IDLE_TIMEOUT_MS;
+                       }
+
+                       else if (!rv && !info.si_pid) {
+                               log_helper("helper no children ready fork_count %d done_count %d", fork_count, done_count);
+                               timeout = ACTIVE_TIMEOUT_MS;
+                       }
+
+                       else if (!rv && info.si_pid) {
+                               done_count++;
+
+                               if (!(ml = _get_command(info.si_pid))) {
+                                       log_helper("command for pid %d result %d not found",
+                                                 info.si_pid, info.si_status);
+                                       continue;
+                               }
+
+                               log_helper("command for pid %d result %d done", info.si_pid, info.si_status);
+
+                               ml->msg.type = HELPER_COMMAND_RESULT;
+                               ml->msg.result = info.si_status;
+
+                               send_result(&ml->msg, out_fd);
+
+                               list_del(&ml->list);
+                               free(ml);
+                               continue;
+                       }
+
+                       else {
+                               log_helper("helper waitid rv %d errno %d fork_count %d done_count %d",
+                                         rv, errno, fork_count, done_count);
+                       }
+
+                       break;
+               }
+       }
+}
index df69824b6167bf509f8a94624bef8f17b1e3eed1..8e713558ab9af3aeb73cbd3671bd7b330bc6e2d2 100644 (file)
@@ -63,6 +63,10 @@ enum {
        LD_OP_QUERY_LOCK,
        LD_OP_REFRESH_LV,
        LD_OP_VG_STATUS,
+       LD_OP_FENCE,
+       LD_OP_FENCE_RESULT,
+       LD_OP_SETLOCKARGS_BEFORE,
+       LD_OP_SETLOCKARGS_FINAL,
 };
 
 /* resource types */
@@ -119,6 +123,7 @@ struct client {
 #define LD_AF_ADOPT_ONLY           0x00200000 /* adopt orphan or fail */
 #define LD_AF_NODELAY              0x00400000
 #define LD_AF_REPAIR              0x00800000
+#define LD_AF_NO_TIMEOUT          0x01000000
 
 /*
  * Number of times to repeat a lock request after
@@ -132,6 +137,32 @@ struct pvs {
        int num;
 };
 
+#define RUN_COMMAND_LEN 1024
+#define MAX_AV_COUNT 32
+#define ONE_ARG_LEN 256
+
+/* helper_msg types */
+#define HELPER_COMMAND 0x1
+#define HELPER_COMMAND_RESULT 0x2
+
+struct helper_msg {
+       uint8_t  type;
+       uint8_t  act;
+       uint16_t unused1;
+       uint32_t msg_id;
+       int pid;
+       int result;
+       char ls_name[MAX_NAME+1];
+       uint8_t unused2;
+       uint16_t unused3;
+       char command[RUN_COMMAND_LEN];
+};
+
+struct helper_msg_list {
+       struct helper_msg msg;
+       struct list_head list;
+};
+
 #define OWNER_NAME_SIZE 64
 #define OWNER_STATE_SIZE 32
 
@@ -147,9 +178,13 @@ struct action {
        struct list_head list;
        uint32_t client_id;
        uint32_t flags;                 /* LD_AF_ */
+       uint32_t msg_id;
        uint32_t version;
        uint32_t host_id;
+       uint64_t ourkey;
+       uint64_t remkey;
        uint64_t lv_size_bytes;
+       uint64_t ls_generation;
        int8_t op;                      /* operation type LD_OP_ */
        int8_t rt;                      /* resource type LD_RT_ */
        int8_t mode;                    /* lock mode LD_LK_ */
@@ -166,7 +201,7 @@ struct action {
        char lv_uuid[MAX_NAME+1];
        char vg_args[MAX_ARGS+1];
        char lv_args[MAX_ARGS+1];
-       char prev_lv_args[MAX_ARGS+1];
+       char other_args[MAX_ARGS+1];
        struct owner owner;
        struct pvs pvs;                 /* PV list for idm */
 };
@@ -187,6 +222,7 @@ struct resource {
        unsigned int use_vb : 1;
        struct list_head locks;
        struct list_head actions;
+       struct list_head fence_wait_actions;
        char lv_args[MAX_ARGS+1];
        char lm_data[];                 /* lock manager specific data */
 };
@@ -209,8 +245,10 @@ struct lockspace {
        char vg_args[MAX_ARGS+1];       /* lock manager specific args */
        int8_t lm_type;                 /* lock manager: LM_DLM, LM_SANLOCK */
        void *lm_data;
+       uint32_t lock_args_flags;
        uint32_t host_id;
        uint64_t generation;
+       uint64_t ourkey;
        uint64_t free_lock_offset;      /* for sanlock, start search for free lock here */
        struct pvs pvs;                 /* for idm: PV list */
 
@@ -225,13 +263,14 @@ struct lockspace {
        unsigned int thread_done : 1;
        unsigned int sanlock_gl_enabled: 1;
        unsigned int sanlock_gl_dup: 1;
-       unsigned int free_vg: 1;
        unsigned int kill_vg: 1;
-       unsigned int drop_vg: 1;
+       unsigned int fence_pr: 1;
+       unsigned int no_timeout: 1;
 
        struct list_head actions;       /* new client actions */
        struct list_head resources;     /* resource/lock state for gl/vg/lv */
        struct list_head dispose;       /* resources to free */
+       struct list_head fence_history; /* internally created actions for fencing */
 };
 
 /* val_blk version */
@@ -390,7 +429,9 @@ void log_level(int level, const char *fmt, ...)  __attribute__((format(printf, 2
 struct lockspace *alloc_lockspace(void);
 int lockspaces_empty(void);
 int last_string_from_args(char *args_in, char *last);
-int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch);
+void helper_main(int in_fd, int out_fd, int log_stderr);
+int lockd_lockargs_get_user_flags(const char *str, uint32_t *flags);
+int lockd_lockargs_get_version(char *str, unsigned int *major, unsigned int *minor, unsigned int *patch);
 
 static inline const char *mode_str(int x)
 {
@@ -559,7 +600,7 @@ static inline int lm_refresh_lv_check_dlm(struct action *act)
 
 #ifdef LOCKDSANLOCK_SUPPORT
 
-int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb);
+int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb, char *other_args);
 int lm_init_lv_sanlock(struct lockspace *ls, char *ls_name, char *vg_name, char *lv_name, char *vg_args, char *lv_args, char *prev_args);
 int lm_free_lv_sanlock(struct lockspace *ls, struct resource *r);
 int lm_rename_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args);
@@ -584,6 +625,9 @@ int lm_data_size_sanlock(void);
 int lm_is_running_sanlock(void);
 int lm_find_free_lock_sanlock(struct lockspace *ls, uint64_t lv_size_bytes);
 int lm_vg_status_sanlock(struct lockspace *ls, struct action *act);
+void lm_set_host_dead_sanlock(struct lockspace *ls, struct owner *owner);
+int lm_setlockargs_supported_sanlock(struct lockspace *ls, struct action *act);
+int lm_setlockargs_vg_sanlock(char *ls_name, char *vg_name, struct action *act);
 
 static inline int lm_support_sanlock(void)
 {
@@ -592,7 +636,7 @@ static inline int lm_support_sanlock(void)
 
 #else
 
-static inline int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb)
+static inline int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb, char *other_args)
 {
        return -1;
 }
@@ -706,6 +750,20 @@ static inline int lm_support_sanlock(void)
        return 0;
 }
 
+static inline void lm_set_host_dead_sanlock(struct lockspace *ls, struct owner *owner)
+{
+}
+
+static inline int lm_setlockargs_supported_sanlock(struct lockspace *ls, struct action *act)
+{
+       return 0;
+}
+
+static inline int lm_setlockargs_vg_sanlock(char *ls_name, char *vg_name, struct action *act)
+{
+       return -1;
+}
+
 #endif /* sanlock support */
 
 #ifdef LOCKDIDM_SUPPORT
index 732c841874dbb9dd831402029874c76abe610a88..40b8448a5dcc8cd1af9b5bc010f01b04248d2e8a 100644 (file)
@@ -28,6 +28,8 @@
 #define SANLK_ADD_NODELAY      0x00000002
 /* FIXME: copied from sanlock header until the sanlock update is more widespread */
 #define SANLK_GET_HOST_LOCAL   0x00000001
+/* FIXME: copied from sanlock header until the sanlock update is more widespread */
+#define SANLK_LSF_NO_TIMEOUT   0x00000004
 
 #include <stddef.h>
 #include <poll.h>
@@ -175,30 +177,32 @@ int lm_data_size_sanlock(void)
 }
 
 /*
- * lock_args format
- *
- * vg_lock_args format for sanlock is
- * vg_version_string:undefined:lock_lv_name
- *
- * lv_lock_args format for sanlock is
- * lv_version_string:undefined:offset
+ * If a new variant of the lock_args string cannot be
+ * handled by the previous version of lvmlockd, then the
+ * new variant should contain a larger major number.
  *
- * version_string is MAJOR.MINOR.PATCH
- * undefined may contain ":"
+ * VG_LOCK_ARGS_V1 format:
+ * 1.0.0:lvname
  *
- * If a new version of the lock_args string cannot be
- * handled by an old version of lvmlockd, then the
- * new lock_args string should contain a larger major number.
+ * VG_LOCK_ARGS_V2 format:
+ * 2.0.0:lvname:notimeout:persist
+ * 2.0.0:lvname:notimeout
+ * 2.0.0:lvname:persist
  */
 
-#define VG_LOCK_ARGS_MAJOR 1
+#define VG_LOCK_ARGS_MAJOR 2
 #define VG_LOCK_ARGS_MINOR 0
 #define VG_LOCK_ARGS_PATCH 0
 
+#define VG_LOCK_ARGS_V1 "1.0.0"
+#define VG_LOCK_ARGS_V2 "2.0.0"
+
 #define LV_LOCK_ARGS_MAJOR 1
 #define LV_LOCK_ARGS_MINOR 0
 #define LV_LOCK_ARGS_PATCH 0
 
+#define LV_LOCK_ARGS_V1 "1.0.0"
+
 /*
  * offset 0 is lockspace
  * offset align_size * 1 is unused
@@ -241,9 +245,31 @@ static void strcpy_name_len(char *buf, const char *str, size_t len)
        memccpy(buf, str, 0, len);
 }
 
-static int lock_lv_name_from_args(char *vg_args, char *lock_lv_name)
+/*
+ * copy out lvname from lock_args string:
+ * 1.0.0:lvname
+ * 2.0.0:lvname
+ * 2.0.0:lvname:other
+ */
+static int lockd_lockargs_get_locklv(char *vg_args, char *lock_lv_name)
 {
-       return last_string_from_args(vg_args, lock_lv_name);
+       char args[MAX_ARGS+1] = {0};
+       char *p, *name;
+
+       strncpy(args, vg_args, MAX_ARGS);
+
+       if (!(p = strchr(args, ':')))
+               return -1;
+
+       name = p+1;
+       if (!*name)
+               return -1;
+
+       if ((p = strchr(name, ':')))
+               *p = '\0';
+
+       strncpy(lock_lv_name, name, MAX_ARGS);
+       return 0;
 }
 
 static int lock_lv_offset_from_args(char *lv_args, uint64_t *lock_lv_offset)
@@ -269,7 +295,7 @@ static int check_args_version(char *args, unsigned int our_major)
        unsigned int major = 0;
        int rv;
 
-       rv = version_from_args(args, &major, NULL, NULL);
+       rv = lockd_lockargs_get_version(args, &major, NULL, NULL);
        if (rv < 0) {
                log_error("check_args_version %s error %d", args, rv);
                return rv;
@@ -333,13 +359,13 @@ out:
 }
 
 #if LOCKDSANLOCK_SUPPORT >= 410
-static int read_info_file(struct lockspace *ls, uint32_t *host_id, uint64_t *generation, int *sector_size, int *align_size)
+static int read_info_file(char *vg_name, uint32_t *host_id, uint64_t *generation, int *sector_size, int *align_size, int *no_timeout)
 {
        char line[MAX_LINE];
        char path[PATH_MAX] = { 0 };
        FILE *fp;
 
-       if (dm_snprintf(path, sizeof(path), "/var/lib/lvm/lvmlockd_info_%s", ls->vg_name) < 0)
+       if (dm_snprintf(path, sizeof(path), "/var/lib/lvm/lvmlockd_info_%s", vg_name) < 0)
                return -1;
 
        if (!(fp = fopen(path, "r"))) {
@@ -362,11 +388,14 @@ static int read_info_file(struct lockspace *ls, uint32_t *host_id, uint64_t *gen
                } else if (!strncmp(line, "align_size ", 11)) {
                        if (sscanf(line, "align_size %d", align_size) != 1)
                                goto fail;
+               } else if (!strncmp(line, "no_timeout ", 11)) {
+                       if (sscanf(line, "no_timeout %d", no_timeout) != 1)
+                               goto fail;
                }
        }
 
        _fclose(fp, path);
-       log_debug("info file: read %u %llu %d %d", *host_id, (unsigned long long)*generation, *sector_size, *align_size);
+       log_debug("info file: read %u %llu %d %d %d", *host_id, (unsigned long long)*generation, *sector_size, *align_size, *no_timeout);
        return 0;
 
 fail:
@@ -376,14 +405,13 @@ fail:
 }
 #endif
 
-static int write_info_file(struct lockspace *ls)
+static int write_info_file(char *vg_name, uint32_t host_id, uint64_t generation, int sector_size, int align_size, int no_timeout)
 {
-       struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data;
        char path[PATH_MAX] = { 0 };
        FILE *fp;
        time_t t = time(NULL);
 
-       if (dm_snprintf(path, sizeof(path), "/var/lib/lvm/lvmlockd_info_%s", ls->vg_name) < 0)
+       if (dm_snprintf(path, sizeof(path), "/var/lib/lvm/lvmlockd_info_%s", vg_name) < 0)
                return -1;
 
        if (!(fp = fopen(path, "w"))) {
@@ -391,17 +419,18 @@ static int write_info_file(struct lockspace *ls)
                return -1;
        }
 
-       fprintf(fp, "# vg %s %s created %s", ls->vg_name, ls->vg_uuid, ctime(&t));
-       fprintf(fp, "host_id %u\n", ls->host_id);
-       fprintf(fp, "generation %llu\n", (unsigned long long)ls->generation);
-       fprintf(fp, "sector_size %d\n", lms->sector_size);
-       fprintf(fp, "align_size %d\n", lms->align_size);
+       fprintf(fp, "# vg %s created %s", vg_name, ctime(&t));
+       fprintf(fp, "host_id %u\n", host_id);
+       fprintf(fp, "generation %llu\n", (unsigned long long)generation);
+       fprintf(fp, "sector_size %d\n", sector_size);
+       fprintf(fp, "align_size %d\n", align_size);
+       fprintf(fp, "no_timeout %d\n", no_timeout);
 
        if (fflush(fp))
                log_warn("Failed to write/flush %s", path);
        _fclose(fp, path);
 
-       log_debug("info file: wrote %u %llu %d %d", ls->host_id, (unsigned long long)ls->generation, lms->sector_size, lms->align_size);
+       log_debug("info file: wrote %u %llu %d %d %d", host_id, (unsigned long long)generation, sector_size, align_size, no_timeout);
        return 0;
 }
 
@@ -591,7 +620,7 @@ static int _lease_corrupt_error(int rv)
    sanlock encoded this in the lockspace/resource structs on disk. */
 
 static int read_lockspace_info(char *path, uint32_t host_id, int *sector_size, int *align_size, int *align_mb,
-                              uint32_t *ss_flags, uint32_t *rs_flags, struct sanlk_host *hs)
+                              uint32_t *ss_size_flags, uint32_t *rs_size_flags, int *no_timeout, struct sanlk_host *hs)
 {
        struct sanlk_lockspace ss;
        uint32_t io_timeout = 0;
@@ -623,40 +652,43 @@ static int read_lockspace_info(char *path, uint32_t host_id, int *sector_size, i
                *sector_size = 4096;
                *align_mb = 8;
                *align_size = 8 * ONE_MB;
-               *ss_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN8M;
-               *rs_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN8M;
+               *ss_size_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN8M;
+               *rs_size_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN8M;
 
        } else if ((ss.flags & SANLK_LSF_SECTOR4K) && (ss.flags & SANLK_LSF_ALIGN4M)) {
                *sector_size = 4096;
                *align_mb = 4;
                *align_size = 4 * ONE_MB;
-               *ss_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN4M;
-               *rs_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN4M;
+               *ss_size_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN4M;
+               *rs_size_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN4M;
 
        } else if ((ss.flags & SANLK_LSF_SECTOR4K) && (ss.flags & SANLK_LSF_ALIGN2M)) {
                *sector_size = 4096;
                *align_mb = 2;
                *align_size = 2 * ONE_MB;
-               *ss_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN2M;
-               *rs_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN2M;
+               *ss_size_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN2M;
+               *rs_size_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN2M;
 
        } else if ((ss.flags & SANLK_LSF_SECTOR4K) && (ss.flags & SANLK_LSF_ALIGN1M)) {
                *sector_size = 4096;
                *align_mb = 1;
                *align_size = ONE_MB;
-               *ss_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN1M;
-               *rs_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN1M;
+               *ss_size_flags = SANLK_LSF_SECTOR4K | SANLK_LSF_ALIGN1M;
+               *rs_size_flags = SANLK_RES_SECTOR4K | SANLK_RES_ALIGN1M;
 
        } else if ((ss.flags & SANLK_LSF_SECTOR512) && (ss.flags & SANLK_LSF_ALIGN1M)) {
                *sector_size = 512;
                *align_mb = 1;
                *align_size = ONE_MB;
-               *ss_flags = SANLK_LSF_SECTOR512 | SANLK_LSF_ALIGN1M;
-               *rs_flags = SANLK_RES_SECTOR512 | SANLK_RES_ALIGN1M;
+               *ss_size_flags = SANLK_LSF_SECTOR512 | SANLK_LSF_ALIGN1M;
+               *rs_size_flags = SANLK_RES_SECTOR512 | SANLK_RES_ALIGN1M;
        }
 
-       log_debug("read_lockspace_info %s %u found sector_size %d align_size %d",
-                 path, host_id, *sector_size, *align_size);
+       if (ss.flags & SANLK_LSF_NO_TIMEOUT)
+               *no_timeout = 1;
+
+       log_debug("read_lockspace_info %s %u found sector_size %d align_size %d no_timeout %d",
+                 path, host_id, *sector_size, *align_size, *no_timeout);
        return 0;
 }
 
@@ -670,43 +702,52 @@ static int read_lockspace_info(char *path, uint32_t host_id, int *sector_size, i
 
 #define MAX_VERSION 16
 
-int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb)
+int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args, int opt_align_mb, char *other_args)
 {
        struct sanlk_lockspace ss;
        struct sanlk_resourced rd;
        struct sanlk_disk disk;
        char lock_lv_name[MAX_ARGS+1];
-       char lock_args_version[MAX_VERSION+1];
        const char *gl_name = NULL;
+       uint32_t lock_args_flags = 0;
        uint32_t rs_flags;
        uint32_t daemon_version;
        uint32_t daemon_proto;
        uint64_t offset;
        uint64_t dev_size;
+       int no_timeout;
+       int persist;
        int sector_size = 0;
        int align_size = 0;
        int align_mb = 0;
        int i, rv;
 
+       if (other_args && (lockd_lockargs_get_user_flags(other_args, &lock_args_flags) < 0)) {
+               log_error("S %s init_vg_san unknown other args %s", ls_name, other_args);
+               return -EARGS;
+       }
+       no_timeout = (lock_args_flags & LOCKARGS_NOTIMEOUT) ? 1 : 0;
+       persist = (lock_args_flags & LOCKARGS_PERSIST) ? 1 : 0;
+
+#if LOCKDSANLOCK_SUPPORT < 420
+       if (no_timeout || persist) {
+               log_error("S %s init_vg_san sanlock 4.2 required for args %s", ls_name, other_args);
+               return -EARGS;
+       }
+#endif
+
        memset(&ss, 0, sizeof(ss));
        memset(&rd, 0, sizeof(rd));
        memset(&disk, 0, sizeof(disk));
-       memset(lock_args_version, 0, sizeof(lock_args_version));
 
        if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) {
                log_error("S %s init_vg_san vg_args missing", ls_name);
                return -EARGS;
        }
 
-       snprintf(lock_args_version, MAX_VERSION, "%u.%u.%u",
-                VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH);
-
        /* see comment above about input vg_args being only lock_lv_name */
        dm_strncpy(lock_lv_name, vg_args, sizeof(lock_lv_name));
 
-       if (strlen(lock_lv_name) + strlen(lock_args_version) + 2 > MAX_ARGS)
-               return -EARGS;
-
        if ((rv = build_dm_path(disk.path, SANLK_PATH_LEN, vg_name, lock_lv_name)))
                return rv;
 
@@ -715,7 +756,7 @@ int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_ar
        if (daemon_test) {
                if (!gl_lsname_sanlock[0])
                        strncpy(gl_lsname_sanlock, ls_name, MAX_NAME);
-               rv = snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, lock_lv_name);
+               rv = snprintf(vg_args, MAX_ARGS, "%s:%s", VG_LOCK_ARGS_V1, lock_lv_name);
                if (rv >= MAX_ARGS)
                        log_debug("init_vg_san vg_args may be too long %d %s", rv, vg_args);
                return 0;
@@ -787,6 +828,9 @@ int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_ar
                return -EARGS;
        }
 
+       if (no_timeout)
+               ss.flags |= SANLK_LSF_NO_TIMEOUT;
+
        rv = sanlock_write_lockspace(&ss, 0, 0, sanlock_io_timeout);
        if (rv < 0) {
                log_error("S %s init_vg_san write_lockspace error %d %s",
@@ -841,15 +885,6 @@ int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_ar
                return rv;
        }
 
-       if (!strcmp(gl_name, R_NAME_GL))
-               dm_strncpy(gl_lsname_sanlock, ls_name, sizeof(gl_lsname_sanlock));
-       rv = snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, lock_lv_name);
-       if (rv >= MAX_ARGS)
-               log_debug("init_vg_san vg_args may be too long %d %s", rv, vg_args);
-
-       log_debug("S %s init_vg_san done vg_args %s", ls_name, vg_args);
-
        /*
         * Go through all lv resource slots and initialize them with the
         * correct lockspace name but a special resource name that indicates
@@ -888,6 +923,25 @@ int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_ar
                offset += align_size;
        }
 
+       if (no_timeout && persist)
+               rv = snprintf(vg_args, MAX_ARGS, "%s:%s:notimeout:persist", VG_LOCK_ARGS_V2, lock_lv_name);
+       else if (no_timeout)
+               rv = snprintf(vg_args, MAX_ARGS, "%s:%s:notimeout", VG_LOCK_ARGS_V2, lock_lv_name);
+       else if (persist)
+               rv = snprintf(vg_args, MAX_ARGS, "%s:%s:persist", VG_LOCK_ARGS_V2, lock_lv_name);
+       else
+               rv = snprintf(vg_args, MAX_ARGS, "%s:%s", VG_LOCK_ARGS_V1, lock_lv_name);
+
+       if (rv >= MAX_ARGS) {
+               log_error("S %s init_vg_san vg_args string too long %d %s", ls_name, rv, vg_args);
+               return -EINVAL;
+       }
+
+       if (!strcmp(gl_name, R_NAME_GL))
+               dm_strncpy(gl_lsname_sanlock, ls_name, sizeof(gl_lsname_sanlock));
+
+       log_debug("S %s init_vg_san done vg_args %s", ls_name, vg_args);
+
        return 0;
 }
 
@@ -905,12 +959,12 @@ int lm_init_lv_sanlock(struct lockspace *ls, char *ls_name, char *vg_name, char
        struct lm_sanlock *lms;
        struct sanlk_resourced rd;
        char lock_lv_name[MAX_ARGS+1];
-       char lock_args_version[MAX_VERSION+1];
        uint64_t offset;
        uint64_t prev_offset = 0;
        int sector_size = 0;
        int align_size = 0;
        int align_mb;
+       int no_timeout = 0;
        uint32_t ss_flags;
        uint32_t rs_flags = 0;
        uint32_t tries = 1;
@@ -918,24 +972,20 @@ int lm_init_lv_sanlock(struct lockspace *ls, char *ls_name, char *vg_name, char
 
        memset(&rd, 0, sizeof(rd));
        memset(lock_lv_name, 0, sizeof(lock_lv_name));
-       memset(lock_args_version, 0, sizeof(lock_args_version));
        memset(disk_path, 0, sizeof(disk_path));
 
-       snprintf(lock_args_version, MAX_VERSION, "%u.%u.%u",
-                LV_LOCK_ARGS_MAJOR, LV_LOCK_ARGS_MINOR, LV_LOCK_ARGS_PATCH);
-
        if (daemon_test) {
                align_size = 1024 * 1024;
                snprintf(lv_args, MAX_ARGS, "%s:%llu",
-                        lock_args_version,
+                        LV_LOCK_ARGS_V1,
                         (unsigned long long)((align_size * LV_LOCK_BEGIN) + (align_size * daemon_test_lv_count)));
                daemon_test_lv_count++;
                return 0;
        }
 
-       rv = lock_lv_name_from_args(vg_args, lock_lv_name);
+       rv = lockd_lockargs_get_locklv(vg_args, lock_lv_name);
        if (rv < 0) {
-               log_error("S %s init_lv_san lock_lv_name_from_args error %d %s",
+               log_error("S %s init_lv_san lockd_lockargs_get_locklv error %d %s",
                          ls_name, rv, vg_args);
                return rv;
        }
@@ -957,7 +1007,7 @@ int lm_init_lv_sanlock(struct lockspace *ls, char *ls_name, char *vg_name, char
 
                /* using host_id 1 to get sizes since we don't need host-specific info */
 
-               rv = read_lockspace_info(disk_path, 1, &sector_size, &align_size, &align_mb, &ss_flags, &rs_flags, NULL);
+               rv = read_lockspace_info(disk_path, 1, &sector_size, &align_size, &align_mb, &ss_flags, &rs_flags, &no_timeout, NULL);
                if (rv < 0) {
                        log_error("S %s init_lv_san read_lockspace_info error %d %s",
                                  ls_name, rv, disk_path);
@@ -1025,7 +1075,7 @@ int lm_init_lv_sanlock(struct lockspace *ls, char *ls_name, char *vg_name, char
                        rv = sanlock_write_resource(&rd.rs, 0, 0, 0);
                        if (!rv) {
                                snprintf(lv_args, MAX_ARGS, "%s:%llu",
-                                        lock_args_version, (unsigned long long)offset);
+                                        LV_LOCK_ARGS_V1, (unsigned long long)offset);
                        } else {
                                log_error("S %s init_lv_san write error %d offset %llu",
                                          ls_name, rv, (unsigned long long)rv);
@@ -1065,9 +1115,9 @@ int lm_rename_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_
                return -EINVAL;
        }
 
-       rv = lock_lv_name_from_args(vg_args, lock_lv_name);
+       rv = lockd_lockargs_get_locklv(vg_args, lock_lv_name);
        if (rv < 0) {
-               log_error("S %s init_lv_san lock_lv_name_from_args error %d %s",
+               log_error("S %s init_lv_san lockd_lockargs_get_locklv error %d %s",
                          ls_name, rv, vg_args);
                return rv;
        }
@@ -1587,6 +1637,7 @@ int lm_prepare_lockspace_sanlock(struct lockspace *ls, uint64_t *prev_generation
        int sector_size = 0;
        int align_size = 0;
        int align_mb = 0;
+       int no_timeout = 0;
        int retries = 0;
        int gl_found;
        int ret, rv;
@@ -1612,9 +1663,9 @@ int lm_prepare_lockspace_sanlock(struct lockspace *ls, uint64_t *prev_generation
                goto fail;
        }
 
-       rv = lock_lv_name_from_args(ls->vg_args, lock_lv_name);
+       rv = lockd_lockargs_get_locklv(ls->vg_args, lock_lv_name);
        if (rv < 0) {
-               log_error("S %s prepare_lockspace_san lock_lv_name_from_args error %d %s",
+               log_error("S %s prepare_lockspace_san lockd_lockargs_get_locklv error %d %s",
                          ls->name, rv, ls->vg_args);
                ret = -EARGS;
                goto fail;
@@ -1711,15 +1762,16 @@ int lm_prepare_lockspace_sanlock(struct lockspace *ls, uint64_t *prev_generation
 #endif
        sector_size = 0;
        align_size = 0;
+       no_timeout = 0;
 
-       rv = read_lockspace_info(disk_path, lms->ss.host_id, &sector_size, &align_size, &align_mb, &ss_flags, &rs_flags, &hs);
+       rv = read_lockspace_info(disk_path, lms->ss.host_id, &sector_size, &align_size, &align_mb, &ss_flags, &rs_flags, &no_timeout, &hs);
 
 #if LOCKDSANLOCK_SUPPORT >= 410
        if ((rv == -ELOCKREPAIR) && repair && !retries) {
                uint64_t generation = 0;
                uint32_t host_id = 0;
 
-               rv = read_info_file(ls, &host_id, &generation, &sector_size, &align_size);
+               rv = read_info_file(ls->vg_name, &host_id, &generation, &sector_size, &align_size, &no_timeout);
                if (rv < 0) {
                        log_error("S %s prepare_lockspace_san cannot repair lockspace no info file", lsname);
                        ret = -EINVAL;
@@ -1750,6 +1802,9 @@ int lm_prepare_lockspace_sanlock(struct lockspace *ls, uint64_t *prev_generation
                        ret = -EINVAL;
                }
 
+               if (no_timeout)
+                       lms->ss.flags |= SANLK_LSF_NO_TIMEOUT;
+
                log_debug("S %s prepare_lockspace_san repair host %u lease", lsname, host_id);
 
                rv = sanlock_init_lockspace_host(&lms->ss, NULL, generation, 0, 0, 0);
@@ -1899,7 +1954,7 @@ int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt_only, int adopt_ok,
 
        free(hs);
 
-       write_info_file(ls);
+       write_info_file(ls->vg_name, ls->host_id, ls->generation, lms->sector_size, lms->align_size, ls->no_timeout);
 
        /*
         * Don't let the lockspace be cleanly released if orphan locks
@@ -2203,6 +2258,7 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
            rv == SANLK_ACQUIRE_OWNED ||
            rv == SANLK_ACQUIRE_OTHER ||
            rv == SANLK_ACQUIRE_OWNED_RETRY ||
+           rv == SANLK_ACQUIRE_OWNED_NO_TIMEOUT ||
            rv == -EAGAIN) {
 
                /*
@@ -2231,6 +2287,9 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
                if (rv == SANLK_ACQUIRE_OWNED_RETRY)
                        *retry = 0;
 
+               if (rv == SANLK_ACQUIRE_OWNED_NO_TIMEOUT)
+                       *retry = 0;
+
                if (owner && owner_host.host_id) {
                        const char *host_state;
 
@@ -2421,6 +2480,7 @@ int lm_convert_sanlock(struct lockspace *ls, struct resource *r,
        case SANLK_ACQUIRE_IDLIVE:
        case SANLK_ACQUIRE_OWNED:
        case SANLK_ACQUIRE_OWNED_RETRY:
+       case SANLK_ACQUIRE_OWNED_NO_TIMEOUT:
        case SANLK_ACQUIRE_OTHER:
        case SANLK_AIO_TIMEOUT:
                /* expected errors from known/normal cases like lock contention or io timeouts */
@@ -2729,3 +2789,181 @@ int lm_is_running_sanlock(void)
        return 1;
 }
 
+#if LOCKDSANLOCK_SUPPORT >= 420
+
+static void update_info_file(char *vg_name, int no_timeout_new)
+{
+       uint32_t host_id;
+       uint64_t generation;
+       int sector_size;
+       int align_size;
+       int no_timeout = 0;
+       int rv;
+
+       rv = read_info_file(vg_name, &host_id, &generation, &sector_size, &align_size, &no_timeout);
+       if (rv < 0)
+               return;
+
+       write_info_file(vg_name, host_id, generation, sector_size, align_size, no_timeout_new);
+}
+
+void lm_set_host_dead_sanlock(struct lockspace *ls, struct owner *owner)
+{
+       struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data;
+       struct sanlk_host host = { 0 };
+       int rv;
+
+       log_debug("S %s set_host_dead_sanlock host_id %u gen %llu", ls->name, owner->host_id, (unsigned long long)owner->generation);
+
+       host.host_id = owner->host_id;
+       host.generation = owner->generation;
+
+       rv = sanlock_set_host(&lms->ss, SANLK_SET_HOST_DEAD_EXT, 0, 0, &host);
+       if (rv)
+               log_error("S %s set_host_dead_sanlock host_id %u gen %llu error %d", ls->name, owner->host_id, (unsigned long long)owner->generation, rv);
+}
+
+int lm_setlockargs_supported_sanlock(struct lockspace *ls, struct action *act)
+{
+       uint32_t daemon_version;
+       uint32_t daemon_proto;
+       uint32_t lock_args_flags = 0;
+       uint32_t ver_major, ver_minor;
+       int rv;
+
+       if (!act->other_args[0]) {
+               log_error("S %s setlockargs_supported empty user lock args", ls->name);
+               return 0;
+       }
+
+       if (lockd_lockargs_get_user_flags(act->other_args, &lock_args_flags) < 0) {
+               log_error("S %s setlockargs_supported invalid user lock args %s", ls->name, act->other_args);
+               return 0;
+       }
+
+       if (!(lock_args_flags & LOCKARGS_NOTIMEOUT) && !(lock_args_flags & LOCKARGS_PERSIST))
+               return 1;
+
+       rv = sanlock_version(0, &daemon_version, &daemon_proto);
+       if (rv < 0) {
+               log_error("S %s setlockargs failed to connect to sanlock daemon", ls->name);
+               return 0;
+       }
+
+       log_debug("S %s setlockargs sanlock version 0x%x lock_args_flags 0x%x", ls->name, daemon_version, lock_args_flags);
+
+       ver_major = (daemon_version & 0xFF000000) >> 24;
+       ver_minor = (daemon_version & 0x00FF0000) >> 16;
+
+       /* sanlock 4.2.0 added support for LOCKARGS_NOTIMEOUT or LOCKARGS_PERSIST. */
+
+       if (ver_major < 4)
+               return 0;
+
+       if ((ver_major == 4) && (ver_minor < 2))
+               return 0;
+
+       return 1;
+}
+
+int lm_setlockargs_vg_sanlock(char *ls_name, char *vg_name, struct action *act)
+{
+       struct sanlk_lockspace ss = {0};
+       char lock_lv_name[MAX_ARGS+1] = {0};
+       char disk_path[SANLK_PATH_LEN] = {0};
+       uint32_t ss_size_flags = 0;
+       uint32_t rs_size_flags = 0;
+       uint32_t lock_args_flags = 0;
+       int sector_size = 0;
+       int align_size = 0;
+       int align_mb = 0;
+       int no_timeout = 0;
+       int persist;
+       int rv;
+
+       if (!act->other_args[0]) {
+               log_error("S %s setlockargs empty user lock args", ls_name);
+               return 0;
+       }
+
+       if (lockd_lockargs_get_user_flags(act->other_args, &lock_args_flags) < 0) {
+               log_error("S %s setlockargs invalid user lock args %s", ls_name, act->other_args);
+               return 0;
+       }
+
+       rv = lockd_lockargs_get_locklv(act->vg_args, lock_lv_name);
+       if (rv < 0) {
+               log_error("S %s setlockargs lockd_lockargs_get_locklv error %d %s",
+                         ls_name, rv, act->vg_args);
+               return rv;
+       }
+
+       if ((rv = build_dm_path(disk_path, SANLK_PATH_LEN, vg_name, lock_lv_name)))
+               return rv;
+
+       /* get the sector and align flags from host_id 1 in the current lockspace */
+
+       rv = read_lockspace_info(disk_path, 1, &sector_size, &align_size, &align_mb, &ss_size_flags, &rs_size_flags, &no_timeout, NULL);
+       if (rv < 0) {
+               log_error("S %s setlockargs read_lockspace_info error %d %s", ls_name, rv, disk_path);
+               return rv;
+       }
+
+       /* initialize lockspace */
+
+       no_timeout = (lock_args_flags & LOCKARGS_NOTIMEOUT) ? 1 : 0;
+       persist = (lock_args_flags & LOCKARGS_PERSIST) ? 1 : 0;
+
+       strcpy_name_len(ss.name, ls_name, SANLK_NAME_LEN);
+       memcpy(ss.host_id_disk.path, disk_path, SANLK_PATH_LEN);
+       ss.host_id_disk.offset = 0;
+       ss.flags = ss_size_flags;
+
+       if (no_timeout)
+               ss.flags |= SANLK_LSF_NO_TIMEOUT;
+
+       log_debug("S %s setlockargs write_lockspace no_timeout %d flags 0x%x", ls_name, no_timeout, ss.flags);
+
+       rv = sanlock_write_lockspace(&ss, 0, 0, sanlock_io_timeout);
+       if (rv < 0) {
+               log_error("S %s setlockargs write_lockspace error %d %s", ls_name, rv, ss.host_id_disk.path);
+               return rv;
+       }
+
+       update_info_file(vg_name, no_timeout);
+
+       if (no_timeout && persist)
+               rv = snprintf(act->vg_args, MAX_ARGS, "%s:%s:notimeout:persist", VG_LOCK_ARGS_V2, lock_lv_name);
+       else if (no_timeout)
+               rv = snprintf(act->vg_args, MAX_ARGS, "%s:%s:notimeout", VG_LOCK_ARGS_V2, lock_lv_name);
+       else if (persist)
+               rv = snprintf(act->vg_args, MAX_ARGS, "%s:%s:persist", VG_LOCK_ARGS_V2, lock_lv_name);
+       else
+               rv = snprintf(act->vg_args, MAX_ARGS, "%s:%s", VG_LOCK_ARGS_V1, lock_lv_name);
+
+       log_debug("S %s setlockargs new args %s", ls_name, act->vg_args);
+
+       if (rv >= MAX_ARGS) {
+               log_error("S %s setlockargs vg_args string too long %d %s", ls_name, rv, act->vg_args);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+#else
+
+void lm_set_host_dead_sanlock(struct lockspace *ls, struct owner *owner)
+{
+}
+
+int lm_setlockargs_supported_sanlock(struct lockspace *ls, struct action *act)
+{
+       return 0;
+}
+
+int lm_setlockargs_vg_sanlock(char *ls_name, char *vg_name, struct action *act)
+{
+       return -EINVAL;
+}
+#endif /* LOCKDSANLOCK_SUPPORT >= 420 */
index 2a65e7cfd915ae53f8e0b13e7285608d180c1079..0d041ce9e08715084f4522d04245e49b7f6bdffb 100644 (file)
@@ -813,7 +813,7 @@ int vg_is_registered(struct cmd_context *cmd, struct volume_group *vg, uint64_t
        }
 }
 
-int persist_is_started(struct cmd_context *cmd, struct volume_group *vg, int may_fail)
+int persist_is_started(struct cmd_context *cmd, struct volume_group *vg, int may_fail, uint64_t *our_key_ret)
 {
        struct pv_list *pvl;
        struct device *dev;
@@ -826,6 +826,9 @@ int persist_is_started(struct cmd_context *cmd, struct volume_group *vg, int may
        if (!vg_is_registered(cmd, vg, &our_key_val, &partial))
                goto out;
 
+       if (our_key_ret)
+               *our_key_ret = our_key_val;
+
        if (partial) {
                log_debug("PR is started: partial");
                goto out;
@@ -1093,7 +1096,7 @@ int persist_key_update(struct cmd_context *cmd, struct volume_group *vg, uint32_
 
        /*
         * When using an explicit pr_key setting, there's
-        * not sanlock generation number that needs updating.
+        * no sanlock generation number that needs updating.
         */
        if (local_key)
                return 1;
@@ -1794,6 +1797,117 @@ static int _persist_extend_shared(struct cmd_context *cmd, struct volume_group *
        return error ? 0 : 1;
 }
 
+/*
+ * Remove this host's PR registration from all devices in the VG,
+ * using the key value (our_key_val) the registration was made with.
+ * NOTE(review): presumably used to back out the exclusive PR taken
+ * by persist_upgrade_ex() — confirm against callers.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int persist_upgrade_stop(struct cmd_context *cmd, struct volume_group *vg, uint64_t our_key_val)
+{
+       DM_LIST_INIT(devs);
+       char our_key_buf[PR_KEY_BUF_SIZE] = { 0 };
+
+       /* Collect the devices underlying all PVs in the VG. */
+       if (!pv_list_to_dev_list(cmd->mem, &vg->pvs, &devs))
+               return_0;
+
+       /* Format the key as the hex string form used by lvmpersist. */
+       if (dm_snprintf(our_key_buf, PR_KEY_BUF_SIZE-1, "0x%llx", (unsigned long long)our_key_val) < 0)
+               return_0;
+
+       if (!_run_stop(cmd, vg, &devs, our_key_buf, 0))
+               return_0;
+
+       return 1;
+}
+
+/*
+ * Host currently holds a normal sh access PR on shared VG,
+ * and wants to switch to an ex access PR on that VG
+ * (to prevent other hosts from using it while it's making
+ * changes.)
+ */
+
+/*
+ * Returns 1 on success (or when no local PR identity is configured,
+ * in which case nothing is done), 0 on failure.  On success the key
+ * now held exclusively is returned in *our_key_held.
+ */
+int persist_upgrade_ex(struct cmd_context *cmd, struct volume_group *vg, uint64_t *our_key_held)
+{
+       DM_LIST_INIT(devs);
+       struct device_list *devl;
+       char *local_key = (char *)find_config_tree_str(cmd, local_pr_key_CFG, NULL);
+       int local_host_id = find_config_tree_int(cmd, local_host_id_CFG, NULL);
+       char our_key_buf[PR_KEY_BUF_SIZE] = { 0 };
+       char new_key_buf[PR_KEY_BUF_SIZE] = { 0 };
+       uint64_t our_key_val = 0;
+       uint64_t new_key_val = 0;
+       const char *devname;
+       const char **argv;
+       int pv_count;
+       int args;
+       int status;
+
+       /* No local PR identity configured; nothing to upgrade. */
+       if (!local_key && !local_host_id)
+               return 1;
+
+       if (!get_our_key(cmd, vg, local_key, local_host_id, our_key_buf, &our_key_val))
+               return_0;
+
+       if (!pv_list_to_dev_list(cmd->mem, &vg->pvs, &devs))
+               return_0;
+
+       log_debug("persist_upgrade_ex stop PR %s", our_key_buf);
+
+       /* Drop the current sh registration before restarting with ex access. */
+       if (!_run_stop(cmd, vg, &devs, our_key_buf, 0))
+               return_0;
+
+       if (local_key) {
+               /* Explicit local key configured: reuse the same key for the ex PR. */
+               new_key_val = our_key_val;
+               memcpy(new_key_buf, our_key_buf, PR_KEY_BUF_SIZE);
+       } else if (local_host_id) {
+               /* Derive a key from host_id: "0x" plus 16 hex digits = 18 chars. */
+               if (dm_snprintf(new_key_buf, PR_KEY_BUF_SIZE-1, "0x100000000000%04x", local_host_id) != 18) {
+                       log_error("Failed to format key string for host_id %d", local_host_id);
+                       return 0;
+               }
+               if (!parse_prkey(new_key_buf, &new_key_val)) {
+                       log_error("Failed to parse generated key %s", new_key_buf);
+                       return 0;
+               }
+       }
+
+       pv_count = dm_list_size(&devs);
+
+       log_debug("persist_upgrade_ex start PR on %d devs with local key %llx", pv_count, (unsigned long long)new_key_val);
+
+       /* 8 fixed argv entries + optional --ptpl + 2 per device + NULL terminator. */
+       args = 9 + pv_count*2;
+       if (vg->pr & VG_PR_PTPL)
+               args += 1;
+
+       if (!(argv = dm_pool_alloc(cmd->mem, args * sizeof(char *))))
+               return_0;
+
+       args = 0;
+       argv[0] = LVMPERSIST_PATH;
+       argv[++args] = "start";
+       argv[++args] = "--ourkey";
+       argv[++args] = new_key_buf;
+       argv[++args] = "--access";
+       argv[++args] = "ex";
+       argv[++args] = "--vg";
+       argv[++args] = vg->name;
+       if (vg->pr & VG_PR_PTPL)
+               argv[++args] = "--ptpl";
+
+       dm_list_iterate_items(devl, &devs) {
+               if (!(devname = dm_pool_strdup(cmd->mem, dev_name(devl->dev))))
+                       return_0;
+               argv[++args] = "--device";
+               argv[++args] = devname;
+       }
+
+       argv[++args] = NULL;
+
+       if (!exec_cmd(cmd, argv, &status, 1)) {
+               log_error("persistent reservation exclusive start failed: lvmpersist command error.");
+               log_error("(Use vgchange --persist stop to stop PR on other hosts.)");
+               return 0;
+       }
+
+       *our_key_held = new_key_val;
+
+       return 1;
+}
+
 /*
  * Start PR on devices that are being used for vgcreate.
  * This is somewhat awkward because it happens early in
@@ -1817,6 +1931,8 @@ int persist_vgcreate_begin(struct cmd_context *cmd, char *vg_name, char *local_k
        int args;
        int status;
 
+       persist_key_file_remove_name(cmd, vg_name);
+
        if (local_key) {
                if (!parse_prkey(local_key, &our_key_val)) {
                        log_error("Failed to parse local key %s", local_key);
@@ -1883,7 +1999,8 @@ int persist_vgcreate_begin(struct cmd_context *cmd, char *vg_name, char *local_k
  * access PR (typically WE), and starts PR with the normal sh access
  * PR (typically WEAR), allowing other hosts to also use the new VG.
  */
-int persist_vgcreate_update(struct cmd_context *cmd, struct volume_group *vg, uint32_t set_flags)
+int persist_vgcreate_update(struct cmd_context *cmd, struct volume_group *vg, uint32_t set_flags,
+                           uint64_t *our_key_ret)
 {
        DM_LIST_INIT(devs);
        struct device_list *devl;
@@ -1987,9 +2104,13 @@ int persist_vgcreate_update(struct cmd_context *cmd, struct volume_group *vg, ui
                return 0;
        }
 
+       /* key file is an optimization, not an error condition */
        if (!write_key_file(cmd, vg, our_key_val))
                stack;
 
+       if (our_key_ret)
+               *our_key_ret = our_key_val;
+
        return 1;
 }
 
index 461f104c7e1cb12d468d206ae755d6eb90d85e56..6c534d2ccfd9a4aa487e732f331f1e9f3e15771e 100644 (file)
@@ -62,9 +62,13 @@ int persist_start_extend(struct cmd_context *cmd, struct volume_group *vg);
 
 int persist_vgcreate_begin(struct cmd_context *cmd, char *vg_name, char *local_key, int local_host_id,
                           uint32_t set_flags, struct dm_list *devs);
-int persist_vgcreate_update(struct cmd_context *cmd, struct volume_group *vg, uint32_t set_flags);
+int persist_vgcreate_update(struct cmd_context *cmd, struct volume_group *vg, uint32_t set_flags,
+                           uint64_t *our_key_ret);
 
-int persist_is_started(struct cmd_context *cmd, struct volume_group *vg, int may_fail);
+int persist_upgrade_ex(struct cmd_context *cmd, struct volume_group *vg, uint64_t *our_key_held);
+int persist_upgrade_stop(struct cmd_context *cmd, struct volume_group *vg, uint64_t our_key_val);
+
+int persist_is_started(struct cmd_context *cmd, struct volume_group *vg, int may_fail, uint64_t *our_key);
 
 int persist_key_update(struct cmd_context *cmd, struct volume_group *vg, uint32_t prev_gen);
 
index 11e7df477d9a5a1dc80cbbbc7285f73f5c0ae49a..c441ad6da42556e50abc1d3abd86d150fa2d6822 100644 (file)
@@ -117,6 +117,62 @@ void lvmlockd_disconnect(void)
        _lvmlockd_connected = 0;
 }
 
+#define MAX_LOCKARGS 8
+
+/* parse lock_args string for values that may appear in command line --setlockargs */
+
+/*
+ * Parse a comma-separated --setlockargs option string (e.g.
+ * "persist,notimeout") and OR the corresponding LOCKARGS_* bits
+ * into *flags.  The caller zeroes *flags first; this function
+ * only adds bits.
+ *
+ * Returns 1 on success, 0 for an unknown value or an invalid
+ * combination of values.
+ */
+int lockd_lockargs_get_user_flags(const char *str, uint32_t *flags)
+{
+       char buf[PATH_MAX];
+       char *argv[MAX_LOCKARGS];
+       int argc;
+       int i;
+
+       if (!str)
+               return 0;
+       dm_strncpy(buf, str, sizeof(buf));
+
+       split_line(buf, &argc, argv, MAX_LOCKARGS, ',');
+
+       for (i = 0; i < argc; i++) {
+               if (!strcmp(argv[i], "persist"))
+                       *flags |= LOCKARGS_PERSIST;
+               else if (!strcmp(argv[i], "nopersist"))
+                       *flags |= LOCKARGS_NOPERSIST;
+               else if (!strcmp(argv[i], "timeout"))
+                       *flags |= LOCKARGS_TIMEOUT;
+               else if (!strcmp(argv[i], "notimeout"))
+                       *flags |= LOCKARGS_NOTIMEOUT;
+               else {
+                       log_error("Unknown lockargs option value: %s", argv[i]);
+                       return 0;
+               }
+       }
+
+       /* Reject directly contradictory settings. */
+       if (((*flags & LOCKARGS_PERSIST) && (*flags & LOCKARGS_NOPERSIST)) ||
+           ((*flags & LOCKARGS_TIMEOUT) && (*flags & LOCKARGS_NOTIMEOUT))) {
+               log_error("Invalid setlockargs option combination: %s", str);
+               return 0;
+       }
+
+       /*
+        * . nopersist and timeout: default
+        * . persist and notimeout: permitted with setlockargs
+        *
+        * FIXME: when tested, allow
+        * . nopersist and notimeout: requires manual set host dead
+        * . persist and timeout: watchdog still resets host when PR is used
+        */
+       if (((*flags & LOCKARGS_PERSIST) && !(*flags & LOCKARGS_NOTIMEOUT)) ||
+           ((*flags & LOCKARGS_NOTIMEOUT) && !(*flags & LOCKARGS_PERSIST))) {
+               log_error("setlockargs persist and notimeout are currently required together.");
+               return 0;
+       }
+
+       return 1;
+}
+
 /* Translate the result strings from lvmlockd to bit flags. */
 static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_flags)
 {
@@ -169,7 +225,7 @@ static char *_owner_str(struct owner *owner)
 #define NO_LOCKD_RESULT (-1000)
 
 static int _lockd_result(struct cmd_context *cmd, const char *req_name, daemon_reply reply,
-                        int *result, uint32_t *lockd_flags, struct owner *owner)
+                        int *result, uint32_t *lockd_flags, struct owner *owner, uint64_t *our_generation)
 {
        int reply_result;
        const char *str;
@@ -206,6 +262,9 @@ static int _lockd_result(struct cmd_context *cmd, const char *req_name, daemon_r
                        owner->name = dm_pool_strdup(cmd->mem, str);
        }
 
+       if (our_generation)
+               *our_generation = (uint64_t)daemon_reply_int(reply, "our_generation", 0);
+
        log_debug("lockd %s result: %d", req_name, reply_result);
        return 1;
 }
@@ -420,7 +479,8 @@ static int _lockd_request(struct cmd_context *cmd,
                          const struct lvmlockd_pvs *lock_pvs,
                          int *result,
                          uint32_t *lockd_flags,
-                         struct owner *owner)
+                         struct owner *owner,
+                         uint64_t *our_generation)
 {
        const char *cmd_name = get_cmd_name();
        daemon_reply reply;
@@ -457,7 +517,7 @@ static int _lockd_request(struct cmd_context *cmd,
                                        "lv_lock_args = %s", lv_lock_args ?: "none",
                                        NULL);
 
-               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
+               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner, our_generation))
                        goto fail;
 
                /*
@@ -477,7 +537,7 @@ static int _lockd_request(struct cmd_context *cmd,
                                        "vg_lock_args = %s", vg_lock_args ?: "none",
                                        NULL);
 
-               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
+               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner, our_generation))
                        goto fail;
 
                /*
@@ -495,7 +555,7 @@ static int _lockd_request(struct cmd_context *cmd,
                                        "vg_lock_type = %s", vg_lock_type ?: "none",
                                        NULL);
 
-               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
+               if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner, our_generation))
                        goto fail;
 
                log_debug("lockd %s %s result %d %x",
@@ -766,7 +826,7 @@ static int _handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg)
                        "lv_size_bytes = " FMTd64, (int64_t) lv_size_bytes,
                        NULL);
 
-       if (!_lockd_result(cmd, "find_free_lock", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "find_free_lock", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -821,7 +881,7 @@ static int _init_vg(struct cmd_context *cmd, struct volume_group *vg,
                                "vg_lock_type = %s", lock_type,
                                NULL);
 
-       if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
                result = -ELOCKD;
        } else {
@@ -892,7 +952,7 @@ static int _init_vg_idm(struct cmd_context *cmd, struct volume_group *vg)
        return _init_vg(cmd, vg, "idm");
 }
 
-static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, int lv_lock_count)
+static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, int lv_lock_count, const char *set_args)
 {
        daemon_reply reply;
        const char *reply_str;
@@ -908,9 +968,9 @@ static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, in
        int ret;
 
        if (!_use_lvmlockd)
-               return 0;
+               return_0;
        if (!_lvmlockd_connected)
-               return 0;
+               return_0;
 
        /*
         * We need the sector size to know what size to create the LV,
@@ -1014,11 +1074,12 @@ static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, in
                                "vg_name = %s", vg->name,
                                "vg_lock_type = %s", "sanlock",
                                "vg_lock_args = %s", vg->sanlock_lv->name,
+                               "set_lock_args = %s", set_args ?: "none",
                                "align_mb = " FMTd64, (int64_t) align_size,
                                "opts = %s", opts ?: "none",
                                NULL);
 
-       if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
                result = -ELOCKD;
        } else {
@@ -1120,7 +1181,7 @@ static int _free_vg(struct cmd_context *cmd, struct volume_group *vg)
                                "vg_lock_args = %s", vg->lock_args,
                                NULL);
 
-       if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
+       if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -1181,7 +1242,7 @@ int lockd_vg_is_busy(struct cmd_context *cmd, struct volume_group *vg)
                                "vg_lock_args = %s", vg->lock_args,
                                NULL);
 
-       if (!_lockd_result(cmd, "busy_vg", reply, &result, &lockd_flags, NULL)) {
+       if (!_lockd_result(cmd, "busy_vg", reply, &result, &lockd_flags, NULL, NULL)) {
                ret = 1;
                goto out;
        }
@@ -1244,7 +1305,7 @@ static int _free_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg)
                                "vg_lock_args = %s", vg->lock_args,
                                NULL);
 
-       if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
+       if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -1301,7 +1362,7 @@ static int _free_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg)
 /* vgcreate */
 
 int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg,
-                 const char *lock_type, int lv_lock_count)
+                 const char *lock_type, int lv_lock_count, const char *set_args)
 {
        switch (get_lock_type_from_string(lock_type)) {
        case LOCK_TYPE_NONE:
@@ -1311,7 +1372,7 @@ int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg,
        case LOCK_TYPE_DLM:
                return _init_vg_dlm(cmd, vg);
        case LOCK_TYPE_SANLOCK:
-               return _init_vg_sanlock(cmd, vg, lv_lock_count);
+               return _init_vg_sanlock(cmd, vg, lv_lock_count, set_args);
        case LOCK_TYPE_IDM:
                return _init_vg_idm(cmd, vg);
        default:
@@ -1437,7 +1498,7 @@ void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg)
  * lock the vg, read/use/write the vg, unlock the vg.
  */
 
-int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists)
+int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, uint64_t our_key, int *exists)
 {
        char uuid[64] __attribute__((aligned(8)));
        const char *opts = NULL;
@@ -1515,6 +1576,7 @@ int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists
                                "vg_uuid = %s", uuid[0] ? uuid : "none",
                                "version = " FMTd64, (int64_t) vg->seqno,
                                "host_id = " FMTd64, (int64_t) host_id,
+                               "our_key = " FMTd64, (int64_t) our_key,
                                "opts = %s", opts ?:  "none",
                                NULL);
                _lockd_free_pv_list(&lock_pvs);
@@ -1528,11 +1590,12 @@ int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists
                                "vg_uuid = %s", uuid[0] ? uuid : "none",
                                "version = " FMTd64, (int64_t) vg->seqno,
                                "host_id = " FMTd64, (int64_t) host_id,
+                               "our_key = " FMTd64, (int64_t) our_key,
                                "opts = %s", opts ?:  "none",
                                NULL);
        }
 
-       if (!_lockd_result(cmd, "start_vg", reply, &result, &lockd_flags, NULL)) {
+       if (!_lockd_result(cmd, "start_vg", reply, &result, &lockd_flags, NULL, NULL)) {
                ret = 0;
                result = -ELOCKD;
        } else {
@@ -1622,7 +1685,7 @@ int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg)
                        "vg_name = %s", vg->name,
                        NULL);
 
-       if (!_lockd_result(cmd, "stop_vg", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "stop_vg", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -1668,7 +1731,7 @@ int lockd_start_wait(struct cmd_context *cmd)
                        "pid = " FMTd64, (int64_t) getpid(),
                        NULL);
 
-       if (!_lockd_result(cmd, "start_wait", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "start_wait", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -1787,7 +1850,7 @@ int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const cha
  req:
        if (!_lockd_request(cmd, "lock_gl",
                              NULL, vg_lock_type, NULL, NULL, NULL, NULL, mode, NULL,
-                             NULL, &result, &lockd_flags, &owner)) {
+                             NULL, &result, &lockd_flags, &owner, NULL)) {
                /* No result from lvmlockd, it is probably not running. */
                log_error("Global lock failed: check that lvmlockd is running.");
                return 0;
@@ -2051,7 +2114,7 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
 
        if (!_lockd_request(cmd, "lock_gl",
                            NULL, NULL, NULL, NULL, NULL, NULL, mode, opts,
-                           NULL, &result, &lockd_flags, &owner)) {
+                           NULL, &result, &lockd_flags, &owner, NULL)) {
                /* No result from lvmlockd, it is probably not running. */
 
                /* We don't care if an unlock fails. */
@@ -2288,6 +2351,7 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
             uint32_t flags, uint32_t *lockd_state)
 {
        struct owner owner = { 0 };
+       uint64_t our_generation = 0;
        char opt_buf[64] = {};
        const char *mode = NULL;
        const char *opts = NULL;
@@ -2402,7 +2466,7 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
 
        if (!_lockd_request(cmd, "lock_vg",
                              vg_name, NULL, NULL, NULL, NULL, NULL, mode, opts,
-                             NULL, &result, &lockd_flags, &owner)) {
+                             NULL, &result, &lockd_flags, &owner, &our_generation)) {
                /*
                 * No result from lvmlockd, it is probably not running.
                 * Decide if it is ok to continue without a lock in
@@ -2615,7 +2679,7 @@ out:
         */
        if ((lockd_flags & LD_RF_DUP_GL_LS) && strcmp(mode, "un"))
                log_warn("Duplicate sanlock global lock in VG %s", vg_name);
+
        return ret;
 }
 
@@ -2660,7 +2724,7 @@ int lockd_vg_update(struct volume_group *vg)
                                "version = " FMTd64, (int64_t) vg->seqno,
                                NULL);
 
-       if (!_lockd_result(vg->cmd, "vg_update", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(vg->cmd, "vg_update", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -2674,6 +2738,7 @@ int lockd_vg_is_started(struct cmd_context *cmd, struct volume_group *vg, uint32
 {
        daemon_reply reply;
        struct owner owner = { 0 };
+       uint64_t our_generation = 0;
        int result;
        int ret = 0;
 
@@ -2691,7 +2756,7 @@ int lockd_vg_is_started(struct cmd_context *cmd, struct volume_group *vg, uint32
                                "vg_name = %s", vg->name,
                                NULL);
 
-       if (!_lockd_result(vg->cmd, "vg_status", reply, &result, NULL, &owner)) {
+       if (!_lockd_result(vg->cmd, "vg_status", reply, &result, NULL, &owner, &our_generation)) {
                log_debug("lockd_vg_status %s no result", vg->name);
                goto out;
        }
@@ -2701,6 +2766,16 @@ int lockd_vg_is_started(struct cmd_context *cmd, struct volume_group *vg, uint32
                goto out;
        }
 
+       /*
+        * The local host generation number is returned
+        * in both fields, they should always match.
+        */
+       if (our_generation && owner.generation &&
+           ((uint32_t)our_generation != owner.generation)) {
+               log_warn("WARNING: lvmlockd local host generation mismatch %llu vs %u",
+                        (unsigned long long)our_generation, owner.generation);
+       }
+
        log_debug("lockd_vg_status %s host_id %u gen %u",
                  vg->name, owner.host_id, owner.generation);
 
@@ -2734,7 +2809,7 @@ static int _query_lv(struct cmd_context *cmd, struct volume_group *vg,
                                "lv_lock_args = %s", lock_args ?: "none",
                                NULL);
 
-       if (!_lockd_result(cmd, "query_lock_lv", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "query_lock_lv", reply, &result, NULL, NULL, NULL)) {
                /* No result from lvmlockd, it is probably not running. */
                log_error("Lock query failed for LV %s/%s", vg->name, lv_name);
                return 0;
@@ -2807,6 +2882,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
        const char *opts = NULL;
        const char *mode = NULL;
        uint32_t lockd_flags;
+       uint64_t our_generation = 0;
        int refreshed = 0;
        int result;
        struct lvmlockd_pvs lock_pvs;
@@ -2905,7 +2981,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
                if (!_lockd_request(cmd, "lock_lv",
                                       vg->name, vg->lock_type, vg->lock_args,
                                       lv_name, lv_uuid, lock_args, mode, opts,
-                                      &lock_pvs, &result, &lockd_flags, NULL)) {
+                                      &lock_pvs, &result, &lockd_flags, NULL, NULL)) {
                        _lockd_free_pv_list(&lock_pvs);
                        /* No result from lvmlockd, it is probably not running. */
                        log_error("Locking failed for LV %s/%s", vg->name, lv_name);
@@ -2916,7 +2992,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
                if (!_lockd_request(cmd, "lock_lv",
                                       vg->name, vg->lock_type, vg->lock_args,
                                       lv_name, lv_uuid, lock_args, mode, opts,
-                                      NULL, &result, &lockd_flags, &owner)) {
+                                      NULL, &result, &lockd_flags, &owner, &our_generation)) {
                        /* No result from lvmlockd, it is probably not running. */
                        log_error("Locking failed for LV %s/%s", vg->name, lv_name);
                        return 0;
@@ -3846,7 +3922,7 @@ static int _init_lv_sanlock(struct cmd_context *cmd, struct volume_group *vg,
                                "vg_lock_args = %s", vg->lock_args,
                                NULL);
 
-       if (!_lockd_result(cmd, "init_lv", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "init_lv", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -3921,7 +3997,7 @@ static int _free_lv(struct cmd_context *cmd, struct volume_group *vg,
                                "lv_lock_args = %s", lock_args ?: "none",
                                NULL);
 
-       if (!_lockd_result(cmd, "free_lv", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "free_lv", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -4186,7 +4262,7 @@ int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg)
                        "vg_lock_args = %s", vg->lock_args,
                        NULL);
 
-       if (!_lockd_result(cmd, "rename_vg_before", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "rename_vg_before", reply, &result, NULL, NULL, NULL)) {
                ret = 0;
        } else {
                ret = (result < 0) ? 0 : 1;
@@ -4231,7 +4307,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
                 * Depending on the problem that caused the rename to
                 * fail, it may make sense to not restart the VG here.
                 */
-               if (!lockd_start_vg(cmd, vg, NULL))
+               if (!lockd_start_vg(cmd, vg, 0, NULL))
                        log_error("Failed to restart VG %s lockspace.", vg->name);
                return 1;
        }
@@ -4251,7 +4327,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
                                "vg_lock_args = %s", vg->lock_args,
                                NULL);
 
-               if (!_lockd_result(cmd, "rename_vg_final", reply, &result, NULL, NULL)) {
+               if (!_lockd_result(cmd, "rename_vg_final", reply, &result, NULL, NULL, NULL)) {
                        ret = 0;
                } else {
                        ret = (result < 0) ? 0 : 1;
@@ -4271,7 +4347,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
                }
        }
 
-       if (!lockd_start_vg(cmd, vg, NULL))
+       if (!lockd_start_vg(cmd, vg, 0, NULL))
                log_error("Failed to start VG %s lockspace.", vg->name);
 
        return 1;
@@ -4292,7 +4368,7 @@ const char *lockd_running_lock_type(struct cmd_context *cmd, int *found_multiple
                        "pid = " FMTd64, (int64_t) getpid(),
                        NULL);
 
-       if (!_lockd_result(cmd, "running_lm", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "running_lm", reply, &result, NULL, NULL, NULL)) {
                log_error("Failed to get result from lvmlockd");
                goto out;
        }
@@ -4413,7 +4489,7 @@ int lockd_lv_refresh(struct cmd_context *cmd, struct lvresize_params *lp)
                                "path = %s", path,
                                NULL);
 
-       if (!_lockd_result(cmd, "refresh_lv", reply, &result, NULL, NULL)) {
+       if (!_lockd_result(cmd, "refresh_lv", reply, &result, NULL, NULL, NULL)) {
                /* No result from lvmlockd, it is probably not running. */
                log_error("LV refresh failed for LV %s", path);
                return 0;
@@ -4487,3 +4563,171 @@ void lockd_lockopt_get_flags(const char *str, uint32_t *flags)
                        log_warn("Ignoring unknown lockopt value: %s", argv[i]);
        }
 }
+
+int lockd_setlockargs(struct cmd_context *cmd, struct volume_group *vg, const char *set_args, uint64_t *our_key_held)
+{
+       daemon_reply reply;
+       const char *reply_str;
+       const char *vg_lock_args = NULL;
+       uint32_t lockd_flags = 0;
+       uint32_t lock_args_flags = 0;
+       int result;
+       int ret;
+
+       if (!_use_lvmlockd) {
+               log_error("lvmlockd is not in use.");
+               return 0;
+       }
+       if (!_lvmlockd_connected) {
+               log_error("lvmlockd is not connected.");
+               return 0;
+       }
+
+       if (!vg->lock_type || strcmp(vg->lock_type, "sanlock")) {
+               log_error("setlockargs is only supported for lock type sanlock.");
+               return 0;
+       }
+
+       if (!set_args)
+               return_0;
+
+       if (!lockd_lockargs_get_user_flags(set_args, &lock_args_flags))
+               return_0;
+
+       if ((lock_args_flags & LOCKARGS_PERSIST) && !(vg->pr & VG_PR_REQUIRE)) {
+               log_error("lockargs \"persist\" requires persistent reservation setting \"require\".");
+               return 0;
+       }
+
+       /*
+        * Check if other PR keys are registered, which would
+        * cause the persist_upgrade_ex below to fail.
+        */
+       if (vg->pr & (VG_PR_REQUIRE | VG_PR_AUTOSTART)) {
+               struct pv_list *pvl;
+               struct device *dev;
+               int key_count;
+
+               dm_list_iterate_items(pvl, &vg->pvs) {
+                       if (!(dev = pvl->pv->dev))
+                               continue;
+                       if (dm_list_empty(&dev->aliases))
+                               continue;
+                       if (!dev_find_key(cmd, dev, 0, 0, NULL, 0, NULL, 1, &key_count, NULL)) {
+                               /* Shouldn't happen if persist_is_started already passed. */
+                               log_error("No PR key found on %s.", dev_name(dev));
+                               return 0;
+                       }
+                       if (key_count != 1) {
+                               log_error("Found %d PR keys on %s, stop PR and lockspace on other hosts.", key_count, dev_name(dev));
+                               log_error("(See vgchange --lockstop --persist stop.)");
+                               return 0;
+                       }
+               }
+       }
+
+       /*
+        * setlockargs_before checks that sanlock version supports
+        * the new set_lock_args, checks that no LV locks are held,
+        * checks we are the only host in the lockspace, and stops
+        * the lockspace.
+        */
+
+       log_debug("lockd setlockargs_vg_before %s", vg->name);
+
+       reply = _lockd_send("setlockargs_vg_before",
+                               "pid = " FMTd64, (int64_t) getpid(),
+                               "vg_name = %s", vg->name,
+                               "vg_lock_type = %s", vg->lock_type,
+                               "vg_lock_args = %s", vg->lock_args,
+                               "set_lock_args = %s", set_args,
+                               NULL);
+
+       if (!_lockd_result(cmd, "setlockargs_vg_before", reply, &result, &lockd_flags, NULL, NULL)) {
+               ret = 0;
+               goto out;
+       }
+
+       if (result == -EBUSY) {
+               log_error("Lockspace for \"%s\" not stopped on other hosts", vg->name);
+               ret = 0;
+               goto out;
+       } else if (result < 0) {
+               log_error("Lockspace setlockargs error %d for \"%s\"", result, vg->name);
+               ret = 0;
+               goto out;
+       }
+
+       daemon_reply_destroy(reply);
+
+       /*
+        * When the VG has the ability to use PR, change the
+        * current PR to an exclusive mode (WE), using a key
+        * with our host_id and gen 0.  The exclusive PR protects
+        * the VG from other hosts while the locking parameters
+        * are being changed (since locking can't be used while
+        * the locking is being changed.)  The lockspace is stopped
+        * while it's being changed.  At the end of the vgchange
+        * setlockargs command, persist_upgrade_stop() releases
+        * the exclusive PR.  After this, any host can do a normal
+        * start of PR/locking using the new lockargs.
+        */
+       if (vg->pr & (VG_PR_REQUIRE | VG_PR_AUTOSTART)) {
+               if (!persist_upgrade_ex(cmd, vg, our_key_held)) {
+                       log_error("Failed to upgrade to exclusive PR.");
+                       log_error("Restart PR and locking to retry setlockargs.");
+                       return 0;
+               }
+       }
+
+       /*
+        * setlockargs_final reformats sanlock leases on the lvmlock LV.
+        * The host generation numbers will all be reset back to 0, and
+        * the PR keys containing the gen will start over from gen 1.
+        * lvmlockd returns a new lock_args string that this command
+        * writes in VG metadata.
+        */
+
+ retry_final:
+       log_debug("lockd setlockargs_vg_final %s", vg->name);
+
+       reply = _lockd_send("setlockargs_vg_final",
+                               "pid = " FMTd64, (int64_t) getpid(),
+                               "vg_name = %s", vg->name,
+                               "vg_lock_type = %s", vg->lock_type,
+                               "vg_lock_args = %s", vg->lock_args,
+                               "set_lock_args = %s", set_args,
+                               NULL);
+
+       if (!_lockd_result(cmd, "setlockargs_vg_final", reply, &result, &lockd_flags, NULL, NULL)) {
+               ret = 0;
+               goto out;
+       }
+
+       if (result == -EAGAIN) {
+               daemon_reply_destroy(reply);
+               sleep(1);
+               goto retry_final;
+       }
+
+       if (!(reply_str = daemon_reply_str(reply, "vg_lock_args", NULL))) {
+               log_error("VG %s setlockargs failed: result %d new lock_args not returned", vg->name, result);
+               ret = 0;
+               goto out;
+       }
+
+       if (!(vg_lock_args = dm_pool_strdup(cmd->mem, reply_str))) {
+               ret = 0;
+               goto out;
+       }
+
+       log_debug("lockd setlockargs_vg %s result %d new lock_args %s", vg->name, result, vg_lock_args);
+
+       vg->lock_args = vg_lock_args;
+       ret = 1;
+
+out:
+       daemon_reply_destroy(reply);
+       return ret;
+}
+
index dc196765a8ad6024711a370a57a4de12003148bd..26ed5cd12388d7c3c0ccf199c6b52a2d48ade186 100644 (file)
@@ -14,6 +14,7 @@
 #include "libdaemon/client/config-util.h"
 #include "libdaemon/client/daemon-client.h"
 #include "lib/metadata/metadata-exported.h" /* is_lockd_type() */
+#include "daemons/lvmlockd/lvmlockd-client.h"
 
 #define LOCKD_SANLOCK_LV_NAME "lvmlock"
 
@@ -66,6 +67,7 @@
 #ifdef LVMLOCKD_SUPPORT
 
 void lockd_lockopt_get_flags(const char *str, uint32_t *flags);
+int lockd_lockargs_get_user_flags(const char *str, uint32_t *flags);
 
 struct lvresize_params;
 struct lvcreate_params;
@@ -82,7 +84,8 @@ void lvmlockd_disconnect(void);
 
 /* vgcreate/vgremove use init/free */
 
-int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, const char *lock_type, int lv_lock_count);
+int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg,
+                  const char *lock_type, int lv_lock_count, const char *set_args);
 int lockd_free_vg_before(struct cmd_context *cmd, struct volume_group *vg, int changing, int yes);
 void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg);
 
@@ -93,7 +96,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
 
 /* start and stop the lockspace for a vg */
 
-int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists);
+int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, uint64_t our_key, int *exists);
 int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg);
 int lockd_start_wait(struct cmd_context *cmd);
 int lockd_vg_is_started(struct cmd_context *cmd, struct volume_group *vg, uint32_t *cur_gen);
@@ -142,12 +145,19 @@ void lockd_lvcreate_done(struct cmd_context *cmd, struct volume_group *vg, struc
 int lockd_lvremove_lock(struct cmd_context *cmd, struct logical_volume *lv, struct logical_volume **lv_other, int *other_unlock);
 void lockd_lvremove_done(struct cmd_context *cmd, struct logical_volume *lv, struct logical_volume *lv_other, int other_unlock);
 
+int lockd_setlockargs(struct cmd_context *cmd, struct volume_group *vg, const char *set_args, uint64_t *our_key_held);
+
 #else /* LVMLOCKD_SUPPORT */
 
 static inline void lockd_lockopt_get_flags(const char *str, uint32_t *flags)
 {
 }
 
+static inline int lockd_lockargs_get_user_flags(const char *str, uint32_t *flags)
+{
+       return 0;
+}
+
 static inline void lvmlockd_set_socket(const char *sock)
 {
 }
@@ -173,7 +183,8 @@ static inline int lvmlockd_use(void)
        return 0;
 }
 
-static inline int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, const char *lock_type, int lv_lock_count)
+static inline int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg,
+                  const char *lock_type, int lv_lock_count, const char *set_args)
 {
        return 1;
 }
@@ -345,6 +356,11 @@ static inline int lockd_vg_is_busy(struct cmd_context *cmd, struct volume_group
        return 0;
 }
 
+static inline int lockd_setlockargs(struct cmd_context *cmd, struct volume_group *vg, const char *set_args, uint64_t *our_key_held)
+{
+       return 0;
+}
+
 #endif /* LVMLOCKD_SUPPORT */
 
 #endif /* _LVMLOCKD_H */
index 096d26b42a70aa7a90341a2d86e96cc369c7f965..071f3806738ec089140dcf3b9fa1961c4a45984d 100644 (file)
@@ -2230,16 +2230,6 @@ static int _validate_lock_args_chars(const char *lock_args)
        return r;
 }
 
-static int _validate_vg_lock_args(struct volume_group *vg)
-{
-       if (!vg->lock_args || !_validate_lock_args_chars(vg->lock_args)) {
-               log_error(INTERNAL_ERROR "VG %s has invalid lock_args chars", vg->name);
-               return 0;
-       }
-
-       return 1;
-}
-
 /*
  * For lock_type sanlock, LV lock_args are <version>:<info>
  * For lock_type dlm, LV lock_args are not used, and lock_args is
@@ -2606,8 +2596,6 @@ int vg_validate(struct volume_group *vg)
                        r = 0;
                }
 
-               if (!_validate_vg_lock_args(vg))
-                       r = 0;
        } else {
                if (vg->lock_args) {
                        log_error(INTERNAL_ERROR "VG %s has lock_args %s without lock_type",
@@ -5150,7 +5138,7 @@ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, const
        }
 
        if ((vg->pr & VG_PR_REQUIRE) && (writing || activating) && !cmd->disable_pr_required) {
-               if (!persist_is_started(cmd, vg, 0)) {
+               if (!persist_is_started(cmd, vg, 0, NULL)) {
                        failure |= FAILED_PR_REQUIRED;
                        goto_bad;
                }
index 621224df7822111cc9f0369c3a65e2bf5509b05e..4e75f5b0f4446d6616f22e42f97a815a368595df 100644 (file)
@@ -157,6 +157,15 @@ Create a shared VG from one host (uses the running lock manager):
 .I VG
 .I devices
 .P
+Include vgcreate options to use Persistent Reservations (sanlock only):
+.br
+.B --setpersist y --setlockargs persist,notimeout
+.P
+Start Persistent Reservations (if they are used):
+.br
+.B $ vgchange --persist start
+.I VG
+.P
 Start the lockspace for the shared VG on all hosts:
 .br
 .B $ vgchange --lockstart
@@ -170,7 +179,7 @@ Regular shutdown steps:
 .br
     $ vgchange -an VG
 .br
-    $ vgchange --lockstop VG
+    $ vgchange --lockstop [--persist stop] VG
 .br
     $ stop lvmlockd and lock manager
 .br
@@ -179,7 +188,7 @@ Regular startup steps:
 .br
     $ start lvmlockd and lock manager
 .br
-    $ vgchange --lockstart VG
+    $ vgchange --lockstart [--persist start] VG
 .P
 .
 .SH SETUP DETAILS
@@ -252,6 +261,25 @@ to begin using locks (i.e. creating and joining a lockspace). Starting the
 VG may take some time, and until the start completes the VG may not be
 modified or activated. When shutting down, the lockspace is stopped with
 vgchange --lockstop VG.
+.P
+.B Persistent Reservations
+.br
+A shared VG with locktype sanlock can take advantage of Persistent
+Reservations (PR) for faster and more reliable recovery. This
+requires that all of the shared devices in the VG support PR.  Test
+if PR is supported by a device with the command:
+.br
+.B $ lvmpersist devtest --device
+.I device
+.P
+The vgcreate command options when enabling PR recovery with sanlock:
+.br
+.B $ vgcreate --shared --setpersist y --setlockargs persist,notimeout
+.P
+When enabled, PR needs to be started for the VG before locking:
+.br
+.B $ vgchange --persist start
+.I VG
 .
 .SH TOPICS
 .
@@ -310,6 +338,53 @@ $ vgs --shared
   vgfoo   1   0   0 wz--ns 992.00m 736.00m
 .fi
 .
+.SS Persistent Reservations
+.
+To enable PR-based recovery ("fencing") in an existing VG:
+.br
+.B $ vgchange --setpersist y --setlockargs persist,notimeout
+.I VG
+.P
+Changing the lock args requires the VG to be stopped on all other nodes.
+.P
+Once enabled, PR needs to be started before or with lockstart:
+.br
+.B $ vgchange --persist start
+.I VG
+.br
+.B $ vgchange --persist start --lockstart
+.I VG
+.P
+Display the VG attributes configured by setpersist and setlockargs:
+.br
+.B $ vgs -o+persist
+.I VG
+.br
+.B $ vgs -o+lockargs
+.I VG
+.P
+.B setpersist y
+.br
+With this setting, LVM requires that PR be started before
+lockstart, and any VG modifications or activations require
+that PR is started.
+.br
+.B setlockargs persist
+.br
+This lockargs setting causes lvmlockd to remove the PR key of a
+failed host when a lock request fails due to a lock owned by the
+failed host. sanlock is then permitted to grant the lock.
+.br
+.B setlockargs notimeout
+.br
+This lockargs setting causes lvmlockd to configure sanlock leases
+to not time out. Removing the PR of a failed host replaces timeouts
+as a faster mechanism for lock recovery. With timeouts disabled,
+the local watchdog is not used by sanlock for the VG lockspace.
+.P
+For more information, see
+.BR lvmpersist (8).
+.
 .SS System ID
 .br
 In contrast to a shared VG, a local VG can only be used by one host
index 0f5cec57f122aa92842db242b7a8fc1d9baf3af8..9744098cb801219e5587c8643470bd938a30750b 100644 (file)
@@ -786,6 +786,20 @@ arg(setautoactivation_ARG, '\0', "setautoactivation", bool_VAL, 0, 0,
     "If autoactivation is enabled on a VG, autoactivation can be disabled\n"
     "for individual LVs.\n")
 
+arg(setlockargs_ARG, '\0', "setlockargs", string_VAL, 0, 0,
+    "Add or remove lock_args settings for a shared VG.\n"
+    "The lock_args determine lock manager behavior for the VG.\n"
+    "These settings are only allowed for lock_type sanlock.\n"
+    "persist: use persistent reservations for lock recovery.\n"
+    "lvmlockd will preempt-abort the persistent reservation of a failed\n"
+    "lock owner so that the lock can be acquired.\n"
+    "notimeout: use locks that do not time out when the owner fails.\n"
+    "In this case, a lock owned by a failed host can only be acquired\n"
+    "using the persist feature.\n"
+    "nopersist: do not use the persist feature.\n"
+    "timeout: do not use the notimeout feature.\n"
+    "The default behavior with no settings configured is: nopersist and timeout.\n")
+
 arg(setpersist_ARG, '\0', "setpersist", string_VAL, 0, 0,
     "#vgcreate\n"
     "Set flags to control persistent reservation behavior.\n"
index 778187e44a1d692928909f6bc21b98ea9d3b6aeb..37af077fa9d67c1849fddfeba0557f4408767425 100644 (file)
@@ -1843,6 +1843,11 @@ OO: --select String, --removekey String, --majoritypvs, --force
 ID: vgchange_persist
 DESC: Perform persistent reservation commands on devices.
 
+vgchange --setlockargs String VG|Tag|Select
+OO: --select String
+ID: vgchange_setlockargs
+DESC: Set or clear lock_args flags to control lock manager behavior.
+
 vgchange --lockstart
 OO: --select String, --persist start
 OP: VG|Tag|Select ...
@@ -1856,6 +1861,7 @@ ID: vgchange_lockstop
 DESC: Stop the lockspace of a shared VG in lvmlockd.
 
 vgchange --locktype LockType VG
+OO: --setlockargs String
 ID: vgchange_locktype
 DESC: Change the lock type for a shared VG.
 
@@ -1880,7 +1886,7 @@ OO: --addtag Tag, --alloc Alloc, --autobackup Bool, --clustered Bool, --maxlogic
 --metadatasize SizeMB, --pvmetadatacopies MetadataCopiesPV, --vgmetadatacopies MetadataCopiesVG,
 --reportformat ReportFmt, --dataalignment SizeKB, --dataalignmentoffset SizeKB,
 --shared, --systemid String, --locktype LockType, --setautoactivation Bool,
---setpersist String, --persist start
+--setpersist String, --persist start, --setlockargs String
 ID: vgcreate_general
 
 ---
index 4fce42341bb9e28e1c1f5c2906ad2fdd532c944b..1ed76f6e022dec88f2f6e35a1024058045e08a84 100644 (file)
@@ -90,6 +90,7 @@ static const struct command_function _command_functions[CMD_COUNT] = {
        { vgchange_systemid_CMD, vgchange_systemid_cmd },
        { vgchange_setpersist_CMD, vgchange_setpersist_cmd },
        { vgchange_persist_CMD, vgchange_persist_cmd },
+       { vgchange_setlockargs_CMD, vgchange_setlockargs_cmd },
 
        /* lvdisplay variants */
        { lvdisplay_columns_CMD,        lvdisplay_columns_cmd },
index b85c6cd250ef7e299ca73b221e0171a043dbd97f..62781047711c4f028e3f057f15ba05430e3c3fd0 100644 (file)
@@ -521,7 +521,8 @@ int vgcreate_params_set_defaults(struct cmd_context *cmd,
  */
 int vgcreate_params_set_from_args(struct cmd_context *cmd,
                                  struct vgcreate_params *vp_new,
-                                 struct vgcreate_params *vp_def)
+                                 struct vgcreate_params *vp_def,
+                                 struct pvcreate_params *pp)
 {
        const char *system_id_arg_str;
        const char *lock_type = NULL;
@@ -736,6 +737,29 @@ int vgcreate_params_set_from_args(struct cmd_context *cmd,
        vp_new->lock_type = lock_type;
 
        log_debug("Setting lock_type to %s", vp_new->lock_type);
+
+       if (arg_is_set(cmd, setlockargs_ARG)) {
+               const char *set_args;
+               uint32_t lock_args_flags = 0;
+
+               if (!lock_type || strcmp(lock_type, "sanlock")) {
+                       log_error("Using setlockargs requires sanlock lock type for shared VG.");
+                       return 0;
+               }
+
+               if (!(set_args = arg_str_value(cmd, setlockargs_ARG, NULL)))
+                       return_0;
+               if (!lockd_lockargs_get_user_flags(set_args, &lock_args_flags))
+                       return_0;
+               if (!pp)
+                       return_0;
+
+               if ((lock_args_flags & LOCKARGS_PERSIST) && !(pp->setpersist_flags & (SETPR_Y | SETPR_REQUIRE))) {
+                       log_error("Using --setlockargs persist requires --setpersist y|require.");
+                       return 0;
+               }
+       }
+
        return 1;
 }
 
index d2033fb0f0680a8a4c5fada2042deea2938661aa..b9428e4781e2a09acf9c64c700bb6df30f308ab6 100644 (file)
@@ -188,7 +188,8 @@ int vgcreate_params_set_defaults(struct cmd_context *cmd,
                                 struct volume_group *vg);
 int vgcreate_params_set_from_args(struct cmd_context *cmd,
                                  struct vgcreate_params *vp_new,
-                                 struct vgcreate_params *vp_def);
+                                 struct vgcreate_params *vp_def,
+                                 struct pvcreate_params *pp);
 int lv_change_activate(struct cmd_context *cmd, struct logical_volume *lv,
                       activation_change_t activate);
 int lv_refresh(struct cmd_context *cmd, struct logical_volume *lv);
index 2d172432a31be1c56377a8cafc61f107b691fe18..c7afc5a033ef8b6ced1bd7d219db0f362aa357ec 100644 (file)
@@ -175,6 +175,7 @@ int vgchange_lock_start_stop_cmd(struct cmd_context *cmd, int argc, char **argv)
 int vgchange_systemid_cmd(struct cmd_context *cmd, int argc, char **argv);
 int vgchange_setpersist_cmd(struct cmd_context *cmd, int argc, char **argv);
 int vgchange_persist_cmd(struct cmd_context *cmd, int argc, char **argv);
+int vgchange_setlockargs_cmd(struct cmd_context *cmd, int argc, char **argv);
 
 const struct opt_name *get_opt_name(int opt);
 const struct val_name *get_val_name(int val);
index b28af4f9b670ba1514094a014f918d987fda1068..8b2e03f902f168db2aaf1bca6a1e8dcc8c93548a 100644 (file)
@@ -683,6 +683,7 @@ static int _passes_lock_start_filter(struct cmd_context *cmd,
 static int _vgchange_lock_start(struct cmd_context *cmd, struct volume_group *vg,
                                struct vgchange_params *vp)
 {
+       uint64_t our_key = 0;
        int auto_opt = 0;
        int exists = 0;
        int r;
@@ -713,12 +714,12 @@ do_start:
        if (!persist_start_include(cmd, vg, 0, auto_opt, NULL))
                return 0;
 
-       if ((vg->pr & (VG_PR_REQUIRE|VG_PR_AUTOSTART)) && !persist_is_started(cmd, vg, 0)) {
+       if ((vg->pr & (VG_PR_REQUIRE|VG_PR_AUTOSTART)) && !persist_is_started(cmd, vg, 0, &our_key)) {
                log_error("VG %s PR should be started before locking (vgchange --persist start)", vg->name);
                return 0;
        }
 
-       r = lockd_start_vg(cmd, vg, &exists);
+       r = lockd_start_vg(cmd, vg, our_key, &exists);
 
        if (r)
                vp->lock_start_count++;
@@ -1339,7 +1340,7 @@ static int _vgchange_locktype(struct cmd_context *cmd, struct volume_group *vg,
 
                vg->system_id = NULL;
 
-               if (!lockd_init_vg(cmd, vg, lock_type, lv_lock_count)) {
+               if (!lockd_init_vg(cmd, vg, lock_type, lv_lock_count, arg_str_value(cmd, setlockargs_ARG, NULL))) {
                        log_error("Failed to initialize lock args for lock type %s", lock_type);
                        return 0;
                }
@@ -1879,7 +1880,7 @@ static int _vgchange_setpersist_single(struct cmd_context *cmd, const char *vg_n
         * enabling/starting PR, otherwise enabling/starting PR will
         * cause i/o to begin failing on those other hosts.
         */
-       if (on && vg_is_shared(vg) && !persist_is_started(cmd, vg, 1) &&
+       if (on && vg_is_shared(vg) && !persist_is_started(cmd, vg, 1, NULL) &&
            lockd_vg_is_started(cmd, vg, NULL) && lockd_vg_is_busy(cmd, vg)) {
                log_error("VG lockspace should be stopped on all hosts (vgchange --lockstop) before enabling PR.");
                return ECMD_FAILED;
@@ -1949,3 +1950,51 @@ int vgchange_setpersist_cmd(struct cmd_context *cmd, int argc, char **argv)
        return ret;
 }
 
+static int _vgchange_setlockargs_single(struct cmd_context *cmd, const char *vg_name,
+                                    struct volume_group *vg,
+                                    struct processing_handle *handle)
+{
+       const char *set = arg_str_value(cmd, setlockargs_ARG, NULL);
+       uint64_t our_key_held = 0;
+
+       if (!set)
+               return_ECMD_FAILED;
+
+       /*
+        * lockd_setlockargs gets exclusive PR (if the VG is using PR),
+        * stops the lockspace, and sets new vg->lock_args that are
+        * written below.  If lockd_setlockargs got the ex PR, then
+        * persist_upgrade_stop releases the PR.
+        */
+       if (!lockd_setlockargs(cmd, vg, set, &our_key_held))
+               return_ECMD_FAILED;
+
+       if (!vg_write(vg) || !vg_commit(vg))
+               return_ECMD_FAILED;
+
+       if (our_key_held && !persist_upgrade_stop(cmd, vg, our_key_held))
+               log_warn("Failed to stop PR.");
+       persist_key_file_remove(cmd, vg);
+
+       log_print_unless_silent("Volume group \"%s\" successfully changed.", vg->name);
+
+       return ECMD_PROCESSED;
+}
+
+int vgchange_setlockargs_cmd(struct cmd_context *cmd, int argc, char **argv)
+{
+       struct processing_handle *handle;
+       uint32_t flags = READ_FOR_UPDATE;
+       int ret;
+
+       if (!(handle = init_processing_handle(cmd, NULL))) {
+               log_error("Failed to initialize processing handle.");
+               return ECMD_FAILED;
+       }
+
+       ret = process_each_vg(cmd, argc, argv, NULL, NULL, flags, 0, handle, &_vgchange_setlockargs_single);
+
+       destroy_processing_handle(cmd, handle);
+       return ret;
+}
+
index d1cdfea1745027db392e248f0893c61374dba7ad..1d06d416e711d2a03b2f4dddc762b8d6f1111690 100644 (file)
@@ -46,13 +46,12 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv)
        pp.pv_names = argv;
        pp.vg_name = vg_name;
        pp.preserve_existing = 1; /* Don't create a new PV on top of an existing PV like pvcreate does. */
-
        pp.check_consistent_block_size = 1;
 
        if (!vgcreate_params_set_defaults(cmd, &vp_def, NULL))
                return EINVALID_CMD_LINE;
        vp_def.vg_name = vg_name;
-       if (!vgcreate_params_set_from_args(cmd, &vp_new, &vp_def))
+       if (!vgcreate_params_set_from_args(cmd, &vp_new, &vp_def, &pp))
                return EINVALID_CMD_LINE;
 
        if (!vgcreate_params_validate(cmd, &vp_new))
@@ -161,7 +160,7 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv)
         * a local VG.  lockd_init_vg() then writes the VG a second time with
         * both lock_type and lock_args set.
         */
-       if (!lockd_init_vg(cmd, vg, vp_new.lock_type, 0)) {
+       if (!lockd_init_vg(cmd, vg, vp_new.lock_type, 0, arg_str_value(cmd, setlockargs_ARG, NULL))) {
                log_error("Failed to initialize lock args for lock type %s",
                          vp_new.lock_type);
                vg_remove_pvs(vg);
@@ -182,13 +181,15 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv)
         * read without locks until the lockspace is done starting.)
         */
        if (vg_is_shared(vg)) {
+               uint64_t our_key = 0;
+
                if (pp.setpersist_flags &&
-                   !persist_vgcreate_update(cmd, vg, pp.setpersist_flags)) {
+                   !persist_vgcreate_update(cmd, vg, pp.setpersist_flags, &our_key)) {
                        log_error("Failed to start PR");
                        goto out;
                }
 
-               if (!lockd_start_vg(cmd, vg, NULL)) {
+               if (!lockd_start_vg(cmd, vg, our_key, NULL)) {
                        log_error("Failed to start locking");
                        goto out;
                }
index 8fcfa6ec31b95fca81e39c6d0291052dfa8622fb..73f058a56582ff54a73e1b5022add4e5ec6e544e 100644 (file)
@@ -609,7 +609,7 @@ int vgsplit(struct cmd_context *cmd, int argc, char **argv)
                        goto_bad;
                }
                vp_def.vg_name = vg_name_to;
-               if (!vgcreate_params_set_from_args(cmd, &vp_new, &vp_def)) {
+               if (!vgcreate_params_set_from_args(cmd, &vp_new, &vp_def, NULL)) {
                        r = EINVALID_CMD_LINE;
                        goto_bad;
                }
This page took 0.240705 seconds and 5 git commands to generate.