Skip to content

Commit

Permalink
Add more debug logs to understand if statement-distribution misbehavi…
Browse files Browse the repository at this point in the history
…ng (#3419)

Add more debug logs to understand if statement-distribution is in a bad
state, should be useful for debugging
#3314 on production
networks.

Additionally, increase the number of parallel requests should make,
since I notice that requests take around 100ms on kusama, and the 5
parallel request was picked mostly random, no reason why we can do more
than that.

---------

Signed-off-by: Alexandru Gheorghe <[email protected]>
Co-authored-by: ordian <[email protected]>
  • Loading branch information
alexggh and ordian authored Feb 26, 2024
1 parent ce5de99 commit b9c792a
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 3 deletions.
25 changes: 24 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@
//! and to keep track of what we have sent to other validators in the group and what we may
//! continue to send them.
use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};

use crate::LOG_TARGET;
use std::collections::{HashMap, HashSet};

#[derive(Hash, PartialEq, Eq)]
Expand Down Expand Up @@ -424,6 +425,28 @@ impl ClusterTracker {
fn is_in_group(&self, validator: ValidatorIndex) -> bool {
self.validators.contains(&validator)
}

/// Dumps pending statement for this cluster.
///
/// Normally we should not have pending statements to validators in our cluster,
/// but if we do for all validators in our cluster, then we don't participate
/// in backing. Ocasional pending statements are expected if two authorities
/// can't detect each otehr or after restart, where it takes a while to discover
/// the whole network.
pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) {
if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
self.validators.len()
{
gum::warn!(
target: LOG_TARGET,
pending_statements = ?self.pending,
?parent_hash,
"Cluster has too many pending statements, something wrong with our connection to our group peers \n
Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
);
}
}
}

/// Incoming statement was accepted.
Expand Down
17 changes: 16 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,13 @@ impl PerSessionState {
if local_index.is_some() {
self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
}

gum::info!(
target: LOG_TARGET,
index_in_gossip_topology = ?local_index,
index_in_parachain_authorities = ?self.local_validator,
"Node uses the following topology indices"
);
}

/// Returns `true` if local is neither active or inactive validator node.
Expand Down Expand Up @@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
let pruned = state.implicit_view.deactivate_leaf(*leaf);
for pruned_rp in pruned {
// clean up per-relay-parent data based on everything removed.
state.per_relay_parent.remove(&pruned_rp);
state
.per_relay_parent
.remove(&pruned_rp)
.as_ref()
.and_then(|pruned| pruned.active_validator_state())
.map(|active_state| {
active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp)
});

// clean up requests related to this relay parent.
state.request_manager.remove_by_relay_parent(*leaf);
}
Expand Down
14 changes: 13 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/requests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,16 @@ impl RequestManager {
request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
// The number of parallel requests a node can answer is limited by
// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
// need for the current node to limit itself to the same amount the
// requests, because the requests are going to different nodes anyways.
// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
// found out that this requests take around 100ms to fullfill, so it
// would make sense to try to request things as early as we can, given
// we would need to request it for each candidate, around 25 right now
// on kusama.
if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
return None
}

Expand Down Expand Up @@ -1027,6 +1036,7 @@ mod tests {
let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
Some(StatementFilter::full(group_size))
};

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
Expand Down Expand Up @@ -1148,6 +1158,7 @@ mod tests {
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
Expand Down Expand Up @@ -1230,6 +1241,7 @@ mod tests {
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
Expand Down

0 comments on commit b9c792a

Please sign in to comment.