From ac53ca37ebbe85743424fb8ce91fd97ec57376a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Border=C3=A9?= Date: Fri, 19 Nov 2021 12:38:59 +0100 Subject: [PATCH 1/2] convert: Pre-vote should not be set when disrupt_leader is set. By definition we have 'approval' to disrupt the leader, so disable the pre-vote algorithm that actually prevents leader disruptions. Will still prevent issue https://github.com/canonical/raft/issues/203 that is covered in the tests. --- src/convert.c | 2 +- src/recv_request_vote.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/convert.c b/src/convert.c index 984fe4206..57c3d90ba 100644 --- a/src/convert.c +++ b/src/convert.c @@ -156,7 +156,7 @@ int convertToCandidate(struct raft *r, bool disrupt_leader) return RAFT_NOMEM; } r->candidate_state.disrupt_leader = disrupt_leader; - r->candidate_state.in_pre_vote = r->pre_vote; + r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote; /* Fast-forward to leader if we're the only voting server in the * configuration. */ diff --git a/src/recv_request_vote.c b/src/recv_request_vote.c index 3cfd1cbf2..b5749c0b2 100644 --- a/src/recv_request_vote.c +++ b/src/recv_request_vote.c @@ -64,7 +64,7 @@ int recvRequestVote(struct raft *r, /* If this is a pre-vote request, don't actually increment our term or * persist the vote. */ - if (args->pre_vote && !args->disrupt_leader) { + if (args->pre_vote) { recvCheckMatchingTerms(r, args->term, &match); } else { rv = recvEnsureMatchingTerms(r, args->term, &match); From 98d8f51231ce214ee5df2f68b406abb7f7dd254b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Border=C3=A9?= Date: Fri, 19 Nov 2021 12:27:09 +0100 Subject: [PATCH 2/2] election: Don't reset vote during pre-vote candidacy. Resetting our vote could lead to double-voting. --- src/election.c | 34 ++++++++++++-------- test/integration/test_election.c | 55 +++++++++++++------------------- 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/src/election.c b/src/election.c index d19ab066e..e3d5fa21f 100644 --- a/src/election.c +++ b/src/election.c @@ -119,19 +119,11 @@ int electionStart(struct raft *r) assert(n_voters <= r->configuration.n); assert(voting_index < n_voters); - /* During pre-vote we don't actually increment term or persist vote, however - * we reset any vote that we previously granted since we have timed out and - * that vote is no longer valid. */ - if (r->candidate_state.in_pre_vote && !r->candidate_state.disrupt_leader) { - /* Reset vote */ - rv = r->io->set_vote(r->io, 0); - if (rv != 0) { - tracef("set_vote failed %d", rv); - goto err; - } - /* Update our cache too. */ - r->voted_for = 0; - } else { + /* During pre-vote we don't increment our term, or reset our vote. Resetting + * our vote could lead to double-voting if we were to receive a RequestVote + * RPC during our Candidate state while we already voted for a server during + * the term. */ + if (!r->candidate_state.in_pre_vote) { /* Increment current term */ term = r->current_term + 1; rv = r->io->set_term(r->io, term); @@ -210,12 +202,26 @@ int electionVote(struct raft *r, is_transferee = r->transfer != NULL && r->transfer->id == args->candidate_id; - if (r->voted_for != 0 && r->voted_for != args->candidate_id && + if (!args->pre_vote && r->voted_for != 0 && r->voted_for != args->candidate_id && !is_transferee) { tracef("local server already voted -> not granting vote"); return 0; } + /* Raft Dissertation 9.6: + * > In the Pre-Vote algorithm, a candidate + * > only increments its term if it first learns from a majority of the + * > cluster that they would be willing + * > to grant the candidate their votes (if the candidate's log is + * > sufficiently up-to-date, and the voters + * > have not received heartbeats from a valid leader for at least a baseline + * > election timeout) + * Arriving here means that in a pre-vote phase, we will cast our vote + * if the candidate's log is sufficiently up-to-date, no matter what the + * candidate's term is. We have already checked if we currently have a leader + * upon reception of the RequestVote RPC, meaning the 2 conditions will be + * satisfied if the candidate's log is up-to-date. + * */ local_last_index = logLastIndex(&r->log); /* Our log is definitely not more up-to-date if it's empty! */ diff --git a/test/integration/test_election.c b/test/integration/test_election.c index 063248de9..05a8ec353 100644 --- a/test/integration/test_election.c +++ b/test/integration/test_election.c @@ -659,7 +659,7 @@ TEST(election, preVoteWithcandidateCrash, setUp, tearDown, 0, cluster_3_params) ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */ ASSERT_TIME(1030); - /* Server 1 completes sending actual RequestVote RPCs */ + /* Server 0 completes sending actual RequestVote RPCs */ CLUSTER_STEP_N(2); CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */ @@ -673,47 +673,36 @@ TEST(election, preVoteWithcandidateCrash, setUp, tearDown, 0, cluster_3_params) /* Server 0 crashes. */ CLUSTER_KILL(0); - /* Server 1 times out and starts an election. It doesn't increment its term - * yet but it reset its vote since it's beginning the pre-vote phase. */ + /* Server 1 times out and starts an election. + * It doesn't increment its term */ STEP_UNTIL_CANDIDATE(1); ASSERT_TIME(2200); ASSERT_TERM(1, 2); - ASSERT_VOTED_FOR(1, 0); - /* Since server 2 has already voted for server 0, it doesn't grant its vote - * and eventually times out and becomes candidate, resetting its vote as - * well. */ - STEP_UNTIL_CANDIDATE(2); - ASSERT_TIME(2300); - ASSERT_TERM(2, 2); - ASSERT_VOTED_FOR(2, 0); + /* Server 1 completes sending the pre-vote RequestVote RPCs and server 2 has + * received those RPCs. + * Since server 2 has no current leader (the leader crashed before sending a + * HeartBeat), it will grant its vote to server 1, but will not persist it + * due to pre-vote, it's persisted vote is still for Server 0 (id 1) */ + CLUSTER_STEP_N(5); + ASSERT_TERM(2, 2); /* Server 2 does not increment its term */ + ASSERT_VOTED_FOR(2, 1); - /* Server 2 completes sending the pre-vote RequestVote RPCs */ + /* Server 1 receives the pre-vote RequestVote Result */ CLUSTER_STEP_N(2); + /* Server 1 increments it's term to start a non pre-vote election */ + ASSERT_TERM(1, 3); /* Server 1 has now incremented its term. */ + ASSERT_VOTED_FOR(1, 2); /* Server 1 has persisted its vote */ + ASSERT_TIME(2230); - CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */ - ASSERT_TERM(1, 2); /* Server 1 does not increment its term */ - ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */ - - /* Server 1 completes sending pre-vote RequestVote results */ + /* Server 1 completes sending actual RequestVote RPCs */ CLUSTER_STEP_N(2); - - /* Server 2 receives the pre-vote RequestVote results */ - CLUSTER_STEP; - ASSERT_CANDIDATE(2); - ASSERT_TERM(2, 3); /* Server 2 has now incremented its term. */ - ASSERT_TIME(2330); - - /* Server 2 completes sending actual RequestVote RPCs */ + /* Server 2 receives the actual RequestVote RPCs */ CLUSTER_STEP_N(2); + ASSERT_VOTED_FOR(2, 2); /* Server 2 persists its vote */ - CLUSTER_STEP_N(2); /* Server 1 receives the actual RequestVote RPC */ - ASSERT_TERM(1, 3); /* Server 1 does increment its term. */ - ASSERT_VOTED_FOR(1, 3); /* Server 1 does persists its vote */ - - CLUSTER_STEP; /* Server 1 completes sending actual RequestVote result */ - CLUSTER_STEP; /* Server 2 receives the actual RequestVote result */ - ASSERT_LEADER(2); - + /* Server 1 receives RequestVote RPCs results and becomes leader */ + CLUSTER_STEP_N(2); + ASSERT_LEADER(1); return MUNIT_OK; }