cadence-workflow · bowenxia · Dec 12, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 12, 2024
@@ -2523,6 +2523,7 @@ const (
 	ReplicationDLQProbeFailed
 	ReplicationDLQSize
 	ReplicationDLQValidationFailed
+	ReplicationMessageTooLargePerShard
 	GetReplicationMessagesForShardLatency
 	GetDLQReplicationMessagesLatency
 	EventReapplySkippedCount
@@ -3217,6 +3218,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
 		ReplicationDLQProbeFailed:                                    {metricName: "replication_dlq_probe_failed", metricType: Counter},
 		ReplicationDLQSize:                                           {metricName: "replication_dlq_size", metricType: Gauge},
 		ReplicationDLQValidationFailed:                               {metricName: "replication_dlq_validation_failed", metricType: Counter},
+		ReplicationMessageTooLargePerShard:                           {metricName: "replication_message_too_large_per_shard", metricType: Counter},
 		GetReplicationMessagesForShardLatency:                        {metricName: "get_replication_messages_for_shard", metricType: Timer},
 		GetDLQReplicationMessagesLatency:                             {metricName: "get_dlq_replication_messages", metricType: Timer},
 		EventReapplySkippedCount:                                     {metricName: "event_reapply_skipped_count", metricType: Counter},

@@ -1555,7 +1555,7 @@ func (h *handlerImpl) GetReplicationMessages(
 
 	h.GetLogger().Debug("Received GetReplicationMessages call.")
 
-	_, sw := h.startRequestProfile(ctx, metrics.HistoryGetReplicationMessagesScope)
+	metricsScope, sw := h.startRequestProfile(ctx, metrics.HistoryGetReplicationMessagesScope)
 	defer sw.Stop()
 
 	if h.isShuttingDown() {
@@ -1601,6 +1601,9 @@ func (h *handlerImpl) GetReplicationMessages(
 
 		size := proto.FromReplicationMessages(tasks).Size()
 		if (responseSize + size) >= maxResponseSize {
+			// Tagged returns a new scope and leaves the receiver untouched, so the counter must be emitted on its result to carry the shard ID tag.
+			metricsScope.Tagged(metrics.ShardIDTag(int(shardID))).IncCounter(metrics.ReplicationMessageTooLargePerShard)
+
 			// Log shards that did not fit for debugging purposes
 			h.GetLogger().Warn("Replication messages did not fit in the response (history host)",
 				tag.ShardID(int(shardID)),