Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardize metrics for task isolation leaking and include cause #6562

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 10 additions & 33 deletions common/isolationgroup/defaultisolationgroupstate/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,12 @@ func NewDefaultIsolationGroupStateWatcherWithConfigStoreClient(
}, nil
}

func (z *defaultIsolationGroupStateHandler) AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availablePollerIsolationGroups []string) (types.IsolationGroupConfiguration, error) {
func (z *defaultIsolationGroupStateHandler) AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, _ string) (types.IsolationGroupConfiguration, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the behavior of this method changed
previously, it only returned undrained isolation groups but now it also returns drained groups
consider renaming it or doing it in a different way if you want to get drained groups

state, err := z.getByDomainID(ctx, domainID)
if err != nil {
return nil, fmt.Errorf("unable to get isolation group state: %w", err)
}
availableIsolationGroupsCfg := isolationGroupHealthyListToConfig(availablePollerIsolationGroups)
scope := z.createAvailableisolationGroupMetricsScope(domainID, tasklistName)
return availableIG(z.config.AllIsolationGroups(), availableIsolationGroupsCfg, state.Global, state.Domain, scope), nil
return availableIG(z.config.AllIsolationGroups(), state.Global, state.Domain), nil
}

func (z *defaultIsolationGroupStateHandler) IsDrained(ctx context.Context, domain string, isolationGroup string) (bool, error) {
Expand Down Expand Up @@ -162,44 +160,34 @@ func (z *defaultIsolationGroupStateHandler) get(ctx context.Context, domain stri
return ig, nil
}

func (z *defaultIsolationGroupStateHandler) createAvailableisolationGroupMetricsScope(domainID string, tasklistName string) metrics.Scope {
domainName, _ := z.domainCache.GetDomainName(domainID)
return z.metricsClient.Scope(metrics.GetAvailableIsolationGroupsScope).
Tagged(metrics.DomainTag(domainName)).
Tagged(metrics.TaskListTag(tasklistName))
}

// A simple explicit deny-based isolation group implementation
func availableIG(
allIsolationGroups []string,
availablePollers types.IsolationGroupConfiguration,
global types.IsolationGroupConfiguration,
domain types.IsolationGroupConfiguration,
scope metrics.Scope,
) types.IsolationGroupConfiguration {
out := types.IsolationGroupConfiguration{}
for _, isolationGroup := range allIsolationGroups {
_, hasAvailablePollers := availablePollers[isolationGroup]
globalCfg, hasGlobalConfig := global[isolationGroup]
domainCfg, hasDomainConfig := domain[isolationGroup]
if hasGlobalConfig {
if globalCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateDrained,
}
continue
}
}
if hasDomainConfig {
if domainCfg.State == types.IsolationGroupStateDrained {
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateDrained)
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateDrained,
}
continue
}
}
if !hasAvailablePollers {
// we don't attempt to dispatch tasks to isolation groups where there are no pollers
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStatePollerUnavailable)
continue
}
scope.Tagged(metrics.PollerIsolationGroupTag(isolationGroup)).IncCounter(metrics.IsolationGroupStateHealthy)
out[isolationGroup] = types.IsolationGroupPartition{
Name: isolationGroup,
State: types.IsolationGroupStateHealthy,
Expand All @@ -223,14 +211,3 @@ func isDrained(isolationGroup string, global types.IsolationGroupConfiguration,
}
return false
}

func isolationGroupHealthyListToConfig(igs []string) types.IsolationGroupConfiguration {
out := make(types.IsolationGroupConfiguration, len(igs))
for _, ig := range igs {
out[ig] = types.IsolationGroupPartition{
Name: ig,
State: types.IsolationGroupStateHealthy,
}
}
return out
}
112 changes: 33 additions & 79 deletions common/isolationgroup/defaultisolationgroupstate/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,24 +74,21 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
json.Unmarshal(validCfgDataDrained, &dynamicConfigResponseDrained)

tests := map[string]struct {
availablePollerIsolationGroups []string
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
dcAffordance func(client *dynamicconfig.MockClient)
domainAffordance func(mock *cache.MockDomainCache)
cfg defaultConfig
expected types.IsolationGroupConfiguration
expectedErr error
}{
"normal case - feature is disabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return false },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Expand All @@ -106,10 +103,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - no drains present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -120,7 +116,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -136,10 +131,9 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"normal case - one drain present - no configuration specifying a drain - feature is enabled": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -150,21 +144,23 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
"zone-1": {
Name: "zone-1",
State: types.IsolationGroupStateDrained,
},
"zone-2": {
Name: "zone-2",
State: types.IsolationGroupStateHealthy,
},
},
},
"expected case - no global drain data configured": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
Expand All @@ -175,7 +171,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{
Expand All @@ -190,7 +185,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},
},
"pathological case - problems with global drain data - 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -210,7 +204,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - cannot resolve domain": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -224,7 +217,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
},

"pathological case - problems with global drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -244,7 +236,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve global drains in an error"),
},
"pathological case - problems with domain drain data - malformed data returned 1": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -260,7 +251,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: a failure"),
},
"pathological case - problems with domain drain data - malformed data returned 2": {
availablePollerIsolationGroups: []string{"zone-1", "zone-2"},
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
Expand All @@ -275,26 +265,6 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
expected: nil,
expectedErr: errors.New("unable to get isolation group state: could not resolve domain in isolationGroup handler: %!w(<nil>)"),
},
"pathological case - no available pollers": {
availablePollerIsolationGroups: nil,
cfg: defaultConfig{
IsolationGroupEnabled: func(string) bool { return true },
AllIsolationGroups: func() []string { return []string{"zone-1", "zone-2", "zone-3"} },
},
dcAffordance: func(client *dynamicconfig.MockClient) {
client.EXPECT().GetListValue(
dynamicconfig.DefaultIsolationGroupConfigStoreManagerGlobalMapping,
gomock.Any(),
).Return(nil, nil)
},
domainAffordance: func(mock *cache.MockDomainCache) {
domainResponse := cache.NewDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id", Name: "domain"}, &persistence.DomainConfig{}, true, nil, 0, nil, 0, 0, 0)
mock.EXPECT().GetDomainByID("domain-id").Return(domainResponse, nil)
mock.EXPECT().GetDomainName("domain-id").Return("domain", nil)
mock.EXPECT().GetDomain("domain").Return(domainResponse, nil).AnyTimes()
},
expected: types.IsolationGroupConfiguration{},
},
}

for name, td := range tests {
Expand All @@ -311,7 +281,7 @@ func TestAvailableIsolationGroupsHandler(t *testing.T) {
config: td.cfg,
metricsClient: metrics.NewNoopMetricsClient(),
}
res, err := handler.AvailableIsolationGroupsByDomainID(context.TODO(), "domain-id", "a-tasklist", td.availablePollerIsolationGroups)
res, err := handler.AvailableIsolationGroupsByDomainID(context.TODO(), "domain-id", "a-tasklist")
assert.Equal(t, td.expected, res)
if td.expectedErr != nil {
assert.Equal(t, td.expectedErr.Error(), err.Error())
Expand Down Expand Up @@ -445,7 +415,7 @@ func TestAvailableIsolationGroups(t *testing.T) {
},
}

isolationGroupsSetB := types.IsolationGroupConfiguration{
isolationGroupsWithCDrained := types.IsolationGroupConfiguration{
igA: {
Name: igA,
State: types.IsolationGroupStateHealthy,
Expand All @@ -454,60 +424,44 @@ func TestAvailableIsolationGroups(t *testing.T) {
Name: igB,
State: types.IsolationGroupStateHealthy,
},
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

isolationGroupsSetC := types.IsolationGroupConfiguration{
drainedC := types.IsolationGroupConfiguration{
igC: {
Name: igC,
State: types.IsolationGroupStateDrained,
},
}

tests := map[string]struct {
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
availablePollers types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
globalIGCfg types.IsolationGroupConfiguration
domainIGCfg types.IsolationGroupConfiguration
expected types.IsolationGroupConfiguration
}{
"default behaviour - no drains - everything should be healthy": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
expected: isolationGroupsAllHealthy,
},
"default behaviour - no drains - only one zone is healthy in terms of pollers, should only return that": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: types.IsolationGroupConfiguration{},
availablePollers: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: types.IsolationGroupConfiguration{
igC: types.IsolationGroupPartition{
Name: igC,
State: types.IsolationGroupStateHealthy,
},
},
expected: isolationGroupsAllHealthy,
},
"default behaviour - one is drained - should return remaining 1/2": {
globalIGCfg: types.IsolationGroupConfiguration{},
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: isolationGroupsSetC, // C is drained
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at domain level": {
globalIGCfg: types.IsolationGroupConfiguration{},
domainIGCfg: drainedC, // C is drained
expected: isolationGroupsWithCDrained, // A and B
},
"default behaviour - one is drained - should return remaining 2/2": {
globalIGCfg: isolationGroupsSetC, // C is drained
availablePollers: isolationGroupsAllHealthy,
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsSetB, // A and B
"default behaviour - drained at global level": {
globalIGCfg: drainedC, // C is drained
domainIGCfg: types.IsolationGroupConfiguration{},
expected: isolationGroupsWithCDrained, // A and B
},
}

for name, td := range tests {
t.Run(name, func(t *testing.T) {
assert.Equal(t, td.expected, availableIG(all, td.availablePollers, td.globalIGCfg, td.domainIGCfg, metrics.NewNoopMetricsClient().Scope(0)))
assert.Equal(t, td.expected, availableIG(all, td.globalIGCfg, td.domainIGCfg))
})
}
}
Expand Down
2 changes: 1 addition & 1 deletion common/isolationgroup/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ type State interface {

// AvailableIsolationGroupsByDomainID returns the available isolation zones for a domain.
// Takes into account global and domain zones
AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string, availableIsolationGroups []string) (types.IsolationGroupConfiguration, error)
AvailableIsolationGroupsByDomainID(ctx context.Context, domainID string, tasklistName string) (types.IsolationGroupConfiguration, error)
}
8 changes: 4 additions & 4 deletions common/isolationgroup/isolation_group_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading