From cbbe2190155a60751efa2e24dbab8b186789ede7 Mon Sep 17 00:00:00 2001
From: QuantumEnergyE
Date: Sat, 2 Feb 2019 15:27:08 +0800
Subject: [PATCH] ctdb_mutex_ceph_rados_helper: fix deadlock via lock renewals

---
 .../utils/ceph/ctdb_mutex_ceph_rados_helper.c | 200 +++++++++++++-----
 ctdb/utils/ceph/test_ceph_rados_reclock.sh    |  57 ++++-
 2 files changed, 205 insertions(+), 52 deletions(-)

diff --git a/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
index 326a0b02519..7ef76c26e02 100644
--- a/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
+++ b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
@@ -1,7 +1,7 @@
 /*
 	CTDB mutex helper using Ceph librados locks
 
-	Copyright (C) David Disseldorp 2016
+	Copyright (C) David Disseldorp 2016-2018
 
 	Based on ctdb_mutex_fcntl_helper.c, which is:
 	Copyright (C) Martin Schwenke 2015
@@ -29,6 +29,11 @@
 #define CTDB_MUTEX_CEPH_LOCK_NAME	"ctdb_reclock_mutex"
 #define CTDB_MUTEX_CEPH_LOCK_COOKIE	CTDB_MUTEX_CEPH_LOCK_NAME
 #define CTDB_MUTEX_CEPH_LOCK_DESC	"CTDB recovery lock"
+/*
+ * During failover it may take up to <lock duration> seconds before the
+ * newly elected recovery master can obtain the lock.
+ */
+#define CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT	10
 
 #define CTDB_MUTEX_STATUS_HOLDING "0"
 #define CTDB_MUTEX_STATUS_CONTENDED "1"
@@ -88,24 +93,20 @@ static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name,
 	return 0;
 }
 
-static void ctdb_mutex_rados_ctx_destroy(rados_t ceph_cluster,
-					 rados_ioctx_t ioctx)
-{
-	rados_ioctx_destroy(ioctx);
-	rados_shutdown(ceph_cluster);
-}
-
 static int ctdb_mutex_rados_lock(rados_ioctx_t *ioctx,
-				 const char *oid)
+				 const char *oid,
+				 uint64_t lock_duration_s,
+				 uint8_t flags)
 {
 	int ret;
+	struct timeval tv = { lock_duration_s, 0 };
 
 	ret = rados_lock_exclusive(ioctx, oid,
-				   CTDB_MUTEX_CEPH_LOCK_NAME,
+				   CTDB_MUTEX_CEPH_LOCK_NAME,
 				   CTDB_MUTEX_CEPH_LOCK_COOKIE,
 				   CTDB_MUTEX_CEPH_LOCK_DESC,
-				   NULL,	/* infinite duration */
-				   0);
+				   lock_duration_s == 0 ? NULL : &tv,
+				   flags);
 	if ((ret == -EEXIST) || (ret == -EBUSY)) {
 		/* lock contention */
 		return ret;
@@ -145,10 +146,13 @@ struct ctdb_mutex_rados_state {
 	const char *ceph_auth_name;
 	const char *pool_name;
 	const char *object;
+	uint64_t lock_duration_s;
 	int ppid;
 	struct tevent_context *ev;
-	struct tevent_signal *sig_ev;
-	struct tevent_timer *timer_ev;
+	struct tevent_signal *sigterm_ev;
+	struct tevent_signal *sigint_ev;
+	struct tevent_timer *ppid_timer_ev;
+	struct tevent_timer *renew_timer_ev;
 	rados_t ceph_cluster;
 	rados_ioctx_t ioctx;
 };
@@ -161,29 +165,24 @@ static void ctdb_mutex_rados_sigterm_cb(struct tevent_context *ev,
 					void *private_data)
 {
 	struct ctdb_mutex_rados_state *cmr_state = private_data;
-	int ret;
+	int ret = 0;
 
 	if (!cmr_state->holding_mutex) {
 		fprintf(stderr, "Sigterm callback invoked without mutex!\n");
 		ret = -EINVAL;
-		goto err_ctx_cleanup;
 	}
 
-	ret = ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
-err_ctx_cleanup:
-	ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
-				     cmr_state->ioctx);
 	talloc_free(cmr_state);
 	exit(ret ? 1 : 0);
 }
 
-static void ctdb_mutex_rados_timer_cb(struct tevent_context *ev,
-				      struct tevent_timer *te,
-				      struct timeval current_time,
-				      void *private_data)
+static void ctdb_mutex_rados_ppid_timer_cb(struct tevent_context *ev,
+					   struct tevent_timer *te,
+					   struct timeval current_time,
+					   void *private_data)
 {
 	struct ctdb_mutex_rados_state *cmr_state = private_data;
-	int ret;
+	int ret = 0;
 
 	if (!cmr_state->holding_mutex) {
 		fprintf(stderr, "Timer callback invoked without mutex!\n");
@@ -193,26 +192,81 @@ static void ctdb_mutex_rados_timer_cb(struct tevent_context *ev,
 
 	if ((kill(cmr_state->ppid, 0) == 0) || (errno != ESRCH)) {
 		/* parent still around, keep waiting */
-		cmr_state->timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+		cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev,
+						       cmr_state,
 						       tevent_timeval_current_ofs(5, 0),
-						       ctdb_mutex_rados_timer_cb,
-						       cmr_state);
-		if (cmr_state->timer_ev == NULL) {
+						       ctdb_mutex_rados_ppid_timer_cb,
+						       cmr_state);
+		if (cmr_state->ppid_timer_ev == NULL) {
 			fprintf(stderr, "Failed to create timer event\n");
 			/* rely on signal cb */
 		}
 		return;
 	}
 
-	/* parent ended, drop lock and exit */
-	ret = ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
+	/* parent ended, drop lock (via destructor) and exit */
 err_ctx_cleanup:
-	ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
-				     cmr_state->ioctx);
 	talloc_free(cmr_state);
 	exit(ret ? 1 : 0);
 }
 
+#define USECS_IN_SEC 1000000
+
+static void ctdb_mutex_rados_lock_renew_timer_cb(struct tevent_context *ev,
+						 struct tevent_timer *te,
+						 struct timeval current_time,
+						 void *private_data)
+{
+	struct ctdb_mutex_rados_state *cmr_state = private_data;
+	struct timeval tv;
+	int ret;
+
+	ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
+				    cmr_state->lock_duration_s,
+				    LIBRADOS_LOCK_FLAG_RENEW);
+	if (ret == -EBUSY) {
+		/* should never get -EEXIST on renewal */
+		fprintf(stderr, "Lock contention during renew: %d\n", ret);
+		goto err_ctx_cleanup;
+	} else if (ret < 0) {
+		fprintf(stderr, "Lock renew failed\n");
+		goto err_ctx_cleanup;
+	}
+
+	tv = tevent_timeval_current_ofs(0,
+			cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
+	cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
+						     cmr_state,
+						     tv,
+					ctdb_mutex_rados_lock_renew_timer_cb,
+						     cmr_state);
+	if (cmr_state->renew_timer_ev == NULL) {
+		fprintf(stderr, "Failed to create timer event\n");
+		goto err_ctx_cleanup;
+	}
+
+	return;
+
+err_ctx_cleanup:
+	/* drop lock (via destructor) and exit */
+	talloc_free(cmr_state);
+	exit(1);
+}
+
+static int ctdb_mutex_rados_state_destroy(struct ctdb_mutex_rados_state *cmr_state)
+{
+	if (cmr_state->holding_mutex) {
+		ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
+	}
+	if (cmr_state->ioctx != NULL) {
+		rados_ioctx_destroy(cmr_state->ioctx);
+	}
+	if (cmr_state->ceph_cluster != NULL) {
+		rados_shutdown(cmr_state->ceph_cluster);
+	}
+	return 0;
+}
+
 int main(int argc, char *argv[])
 {
 	int ret;
@@ -220,9 +274,10 @@ int main(int argc, char *argv[])
 
 	progname = argv[0];
 
-	if (argc != 5) {
+	if ((argc != 5) && (argc != 6)) {
 		fprintf(stderr, "Usage: %s <Ceph Cluster> <Ceph user> "
-				"<RADOS pool> <RADOS object>\n",
+				"<RADOS pool> <RADOS object> "
+				"[lock duration secs]\n",
 			progname);
 		ret = -EINVAL;
 		goto err_out;
@@ -240,10 +295,24 @@ int main(int argc, char *argv[])
 		goto err_out;
 	}
 
+	talloc_set_destructor(cmr_state, ctdb_mutex_rados_state_destroy);
 	cmr_state->ceph_cluster_name = argv[1];
 	cmr_state->ceph_auth_name = argv[2];
 	cmr_state->pool_name = argv[3];
 	cmr_state->object = argv[4];
+	if (argc == 6) {
+		/* optional lock duration provided */
+		char *endptr = NULL;
+		cmr_state->lock_duration_s = strtoull(argv[5], &endptr, 0);
+		if ((endptr == argv[5]) || (*endptr != '\0')) {
+			fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+			ret = -EINVAL;
+			goto err_ctx_cleanup;
+		}
+	} else {
+		cmr_state->lock_duration_s
+			= CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT;
+	}
 
 	cmr_state->ppid = getppid();
 	if (cmr_state->ppid == 1) {
@@ -257,7 +326,7 @@ int main(int argc, char *argv[])
 		 */
 		fprintf(stderr, "%s: PPID == 1\n", progname);
 		ret = -EPIPE;
-		goto err_state_free;
+		goto err_ctx_cleanup;
 	}
 
 	cmr_state->ev = tevent_context_init(cmr_state);
@@ -265,30 +334,40 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "tevent_context_init failed\n");
 		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
 		ret = -ENOMEM;
-		goto err_state_free;
+		goto err_ctx_cleanup;
 	}
 
 	/* wait for sigterm */
-	cmr_state->sig_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
+	cmr_state->sigterm_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
 					      ctdb_mutex_rados_sigterm_cb,
 					      cmr_state);
-	if (cmr_state->sig_ev == NULL) {
-		fprintf(stderr, "Failed to create signal event\n");
+	if (cmr_state->sigterm_ev == NULL) {
+		fprintf(stderr, "Failed to create term signal event\n");
 		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
 		ret = -ENOMEM;
-		goto err_state_free;
+		goto err_ctx_cleanup;
+	}
+
+	cmr_state->sigint_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGINT, 0,
+					      ctdb_mutex_rados_sigterm_cb,
+					      cmr_state);
+	if (cmr_state->sigint_ev == NULL) {
+		fprintf(stderr, "Failed to create int signal event\n");
+		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+		ret = -ENOMEM;
+		goto err_ctx_cleanup;
 	}
 
 	/* periodically check parent */
-	cmr_state->timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+	cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
 					       tevent_timeval_current_ofs(5, 0),
-					       ctdb_mutex_rados_timer_cb,
+					       ctdb_mutex_rados_ppid_timer_cb,
 					       cmr_state);
-	if (cmr_state->timer_ev == NULL) {
+	if (cmr_state->ppid_timer_ev == NULL) {
 		fprintf(stderr, "Failed to create timer event\n");
 		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
 		ret = -ENOMEM;
-		goto err_state_free;
+		goto err_ctx_cleanup;
 	}
 
 	ret = ctdb_mutex_rados_ctx_create(cmr_state->ceph_cluster_name,
@@ -298,10 +377,12 @@ int main(int argc, char *argv[])
 					  &cmr_state->ioctx);
 	if (ret < 0) {
 		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
-		goto err_state_free;
+		goto err_ctx_cleanup;
 	}
 
-	ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object);
+	ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
+				    cmr_state->lock_duration_s,
+				    0);
 	if ((ret == -EEXIST) || (ret == -EBUSY)) {
 		fprintf(stdout, CTDB_MUTEX_STATUS_CONTENDED);
 		goto err_ctx_cleanup;
@@ -309,8 +390,28 @@ int main(int argc, char *argv[])
 		fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
 		goto err_ctx_cleanup;
 	}
-	cmr_state->holding_mutex = true;
+
+	cmr_state->holding_mutex = true;
+
+	if (cmr_state->lock_duration_s != 0) {
+		/*
+		 * renew (reobtain) the lock, using a period of half the lock
+		 * duration. Convert to usecs to avoid rounding.
+		 */
+		struct timeval tv = tevent_timeval_current_ofs(0,
+				cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
+		cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
+							     cmr_state,
+							     tv,
+					ctdb_mutex_rados_lock_renew_timer_cb,
+							     cmr_state);
+		if (cmr_state->renew_timer_ev == NULL) {
+			fprintf(stderr, "Failed to create timer event\n");
+			fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+			ret = -ENOMEM;
+			goto err_ctx_cleanup;
+		}
+	}
+
 	fprintf(stdout, CTDB_MUTEX_STATUS_HOLDING);
 
 	/* wait for the signal / timer events to do their work */
@@ -319,9 +420,6 @@ int main(int argc, char *argv[])
 		goto err_ctx_cleanup;
 	}
 err_ctx_cleanup:
-	ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
-				     cmr_state->ioctx);
-err_state_free:
 	talloc_free(cmr_state);
 err_out:
 	return ret ? 1 : 0;
diff --git a/ctdb/utils/ceph/test_ceph_rados_reclock.sh b/ctdb/utils/ceph/test_ceph_rados_reclock.sh
index 1adacf6a27c..68b44ff997f 100755
--- a/ctdb/utils/ceph/test_ceph_rados_reclock.sh
+++ b/ctdb/utils/ceph/test_ceph_rados_reclock.sh
@@ -46,7 +46,9 @@ which ctdb_mutex_ceph_rados_helper || exit 1
 TMP_DIR="$(mktemp --directory)" || exit 1
 
 rados -p "$POOL" rm "$OBJECT"
-(ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
+# explicitly disable lock expiry (duration=0), to ensure that we don't get
+# intermittent failures (due to renewal) from the lock state diff further down
+(ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" 0 \
 	> ${TMP_DIR}/first) &
 locker_pid=$!
 
@@ -78,6 +80,9 @@ LOCKER_COOKIE="$(jq -r '.lockers[0].cookie' ${TMP_DIR}/lock_state_first)"
 LOCKER_DESC="$(jq -r '.lockers[0].description' ${TMP_DIR}/lock_state_first)"
 [ "$LOCKER_DESC" == "CTDB recovery lock" ] \
 	|| _fail "unexpected locker description: $LOCKER_DESC"
+LOCKER_EXP="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_first)"
+[ "$LOCKER_EXP" == "0.000000" ] \
+	|| _fail "unexpected locker expiration: $LOCKER_EXP"
 
 # second attempt while first is still holding the lock - expect failure
 ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
@@ -145,6 +150,56 @@ third_out=$(cat ${TMP_DIR}/third)
 [ "$third_out" == "0" ] \
 	|| _fail "expected lock acquisition (0), but got $third_out"
 
+# test renew / expire behaviour using a 1s expiry (update period = 500ms)
+exec >${TMP_DIR}/forth -- ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" \
+	"$POOL" "$OBJECT" 1 &
+locker_pid=$!
+
+sleep 1
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+	> ${TMP_DIR}/lock_state_fifth_a
+#echo "with lock fifth: `cat ${TMP_DIR}/lock_state_fifth_a`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+	|| _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+	|| _fail "unexpected lock type: $LOCK_TYPE"
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_fifth_a)"
+[ $LOCK_COUNT -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"
+LOCKER_EXP_A="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_fifth_a)"
+[ "$LOCKER_EXP_A" != "0.000000" ] \
+	|| _fail "unexpected locker expiration: $LOCKER_EXP_A"
+
+sleep 1	# sleep until renewal
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+	> ${TMP_DIR}/lock_state_fifth_b
+LOCKER_EXP_B="$(jq -r '.lockers[0].expiration' ${TMP_DIR}/lock_state_fifth_b)"
+[ "$LOCKER_EXP_B" != "0.000000" ] \
+	|| _fail "unexpected locker expiration: $LOCKER_EXP_B"
+#echo "lock expiration before renewal $LOCKER_EXP_A, after renewal $LOCKER_EXP_B"
+[ "$LOCKER_EXP_B" != "$LOCKER_EXP_A" ] \
+	|| _fail "locker expiration matches: $LOCKER_EXP_B"
+
+# no chance to drop the lock, rely on expiry
+kill -KILL $locker_pid || exit 1
+wait $locker_pid &> /dev/null
+sleep 1	# sleep until lock expiry
+
+rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
+	> ${TMP_DIR}/lock_state_sixth
+#echo "lock expiry sixth: `cat ${TMP_DIR}/lock_state_sixth`"
+
+LOCK_NAME="$(jq -r '.name' ${TMP_DIR}/lock_state_sixth)"
+[ "$LOCK_NAME" == "ctdb_reclock_mutex" ] \
+	|| _fail "unexpected lock name: $LOCK_NAME"
+LOCK_TYPE="$(jq -r '.type' ${TMP_DIR}/lock_state_sixth)"
+[ "$LOCK_TYPE" == "exclusive" ] \
+	|| _fail "unexpected lock type: $LOCK_TYPE"
+LOCK_COUNT="$(jq -r '.lockers | length' ${TMP_DIR}/lock_state_sixth)"
+[ $LOCK_COUNT -eq 0 ] || _fail "expected 0 locks in rados state, got $LOCK_COUNT"
+
 rm ${TMP_DIR}/*
 rmdir $TMP_DIR
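
Usage note (illustrative, not part of the patch): CTDB invokes this helper through its "recovery lock" setting, where a leading "!" marks the value as an external mutex helper command rather than a plain lock file. A minimal sketch of a ctdb.conf entry exercising the new optional lock-duration argument, assuming a cluster named "ceph", auth name "client.samba", pool "rbd" and object "ctdb_reclock" (all example values, as is the helper path):

    # /etc/ctdb/ctdb.conf -- example values only
    [cluster]
        recovery lock = !/usr/libexec/ctdb/ctdb_mutex_ceph_rados_helper ceph client.samba rbd ctdb_reclock 10

With a non-zero duration (10s by default when the argument is omitted), the helper re-takes the lock with LIBRADOS_LOCK_FLAG_RENEW every duration/2 seconds. If the holder dies without unlocking (e.g. SIGKILL, as exercised at the end of the test above), the lock now expires on its own instead of being held forever, so a newly elected recovery master can obtain it; passing a duration of 0 retains the old infinite-duration behaviour.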