Skip to content

Commit

Permalink
[cmd/opampsupervisor] fix: Clear config timeout timer & send APPLIED …
Browse files Browse the repository at this point in the history
…remote config status when running nop config (#36733)
  • Loading branch information
dpaasman00 authored Dec 20, 2024
1 parent 08e0bb4 commit a205064
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 8 deletions.
27 changes: 27 additions & 0 deletions .chloggen/fix_supervisor-reports-error-nop-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: opampsupervisor

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Report an 'Applied' remote config status when an empty config is received.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [36682]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
17 changes: 17 additions & 0 deletions cmd/opampsupervisor/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1562,6 +1562,23 @@ func TestSupervisorRemoteConfigApplyStatus(t *testing.T) {
status, ok := remoteConfigStatus.Load().(*protobufs.RemoteConfigStatus)
return ok && status.Status == protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED
}, 15*time.Second, 100*time.Millisecond, "Remote config status was not set to FAILED for bad config")

// Test with nop configuration
emptyHash := sha256.Sum256([]byte{})
server.sendToSupervisor(&protobufs.ServerToAgent{
RemoteConfig: &protobufs.AgentRemoteConfig{
Config: &protobufs.AgentConfigMap{
ConfigMap: map[string]*protobufs.AgentConfigFile{},
},
ConfigHash: emptyHash[:],
},
})

// Check that the status is set to APPLIED
require.Eventually(t, func() bool {
status, ok := remoteConfigStatus.Load().(*protobufs.RemoteConfigStatus)
return ok && status.Status == protobufs.RemoteConfigStatuses_RemoteConfigStatuses_APPLIED
}, 5*time.Second, 10*time.Millisecond, "Remote config status was not set to APPLIED for empty config")
}

func TestSupervisorOpAmpServerPort(t *testing.T) {
Expand Down
50 changes: 42 additions & 8 deletions cmd/opampsupervisor/supervisor/supervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ func (c *configState) equal(other *configState) bool {
return other.mergedConfig == c.mergedConfig && other.configMapIsEmpty == c.configMapIsEmpty
}

type agentStartStatus string

var (
agentStarting agentStartStatus = "starting"
agentNotStarting agentStartStatus = "notStarting"
)

// Supervisor implements supervising of OpenTelemetry Collector and uses OpAMPClient
// to work with an OpAMP Server.
type Supervisor struct {
Expand Down Expand Up @@ -1000,22 +1007,27 @@ func (s *Supervisor) handleRestartCommand() error {
return err
}

func (s *Supervisor) startAgent() {
func (s *Supervisor) startAgent() (agentStartStatus, error) {
if s.cfgState.Load().(*configState).configMapIsEmpty {
// Don't start the agent if there is no config to run
s.logger.Info("No config present, not starting agent.")
return
// need to manually trigger updating effective config
err := s.opampClient.UpdateEffectiveConfig(context.Background())
if err != nil {
s.logger.Error("The OpAMP client failed to update the effective config", zap.Error(err))
}
return agentNotStarting, nil
}

err := s.commander.Start(context.Background())
if err != nil {
s.logger.Error("Cannot start the agent", zap.Error(err))
err = s.opampClient.SetHealth(&protobufs.ComponentHealth{Healthy: false, LastError: fmt.Sprintf("Cannot start the agent: %v", err)})
startErr := fmt.Errorf("Cannot start the agent: %w", err)
err = s.opampClient.SetHealth(&protobufs.ComponentHealth{Healthy: false, LastError: startErr.Error()})
if err != nil {
s.logger.Error("Failed to report OpAMP client health", zap.Error(err))
}

return
return "", startErr
}

s.agentHasStarted = false
Expand All @@ -1024,6 +1036,7 @@ func (s *Supervisor) startAgent() {
s.startHealthCheckTicker()

s.healthChecker = healthchecker.NewHTTPHealthChecker(fmt.Sprintf("http://%s", s.agentHealthCheckEndpoint))
return agentStarting, nil
}

func (s *Supervisor) startHealthCheckTicker() {
Expand Down Expand Up @@ -1090,7 +1103,11 @@ func (s *Supervisor) runAgentProcess() {
if _, err := os.Stat(s.agentConfigFilePath()); err == nil {
// We have an effective config file saved previously. Use it to start the agent.
s.logger.Debug("Effective config found, starting agent initial time")
s.startAgent()
_, err := s.startAgent()
if err != nil {
s.logger.Error("starting agent failed", zap.Error(err))
s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error())
}
}

restartTimer := time.NewTimer(0)
Expand All @@ -1114,7 +1131,17 @@ func (s *Supervisor) runAgentProcess() {
s.logger.Debug("Restarting agent due to new config")
restartTimer.Stop()
s.stopAgentApplyConfig()
s.startAgent()
status, err := s.startAgent()
if err != nil {
s.logger.Error("starting agent with new config failed", zap.Error(err))
s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error())
}

if status == agentNotStarting {
// not starting agent because of nop config, clear timer
configApplyTimeoutTimer.Stop()
s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_APPLIED, "")
}

case <-s.commander.Exited():
// the agent process exit is expected for restart command and will not attempt to restart
Expand Down Expand Up @@ -1146,7 +1173,11 @@ func (s *Supervisor) runAgentProcess() {

case <-restartTimer.C:
s.logger.Debug("Agent starting after start backoff")
s.startAgent()
_, err := s.startAgent()
if err != nil {
s.logger.Error("restarting agent failed", zap.Error(err))
s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error())
}

case <-configApplyTimeoutTimer.C:
if s.lastHealthFromClient == nil || !s.lastHealthFromClient.Healthy {
Expand Down Expand Up @@ -1242,6 +1273,9 @@ func (s *Supervisor) saveLastReceivedOwnTelemetrySettings(set *protobufs.Telemet
}

func (s *Supervisor) reportConfigStatus(status protobufs.RemoteConfigStatuses, errorMessage string) {
if !s.config.Capabilities.ReportsRemoteConfig {
s.logger.Debug("supervisor is not configured to report remote config status")
}
err := s.opampClient.SetRemoteConfigStatus(&protobufs.RemoteConfigStatus{
LastRemoteConfigHash: s.remoteConfig.GetConfigHash(),
Status: status,
Expand Down

0 comments on commit a205064

Please sign in to comment.