From b49c5329991d4bb514b79992cc6d4874795e0967 Mon Sep 17 00:00:00 2001 From: Alexander Akhmetov Date: Thu, 3 Apr 2025 09:40:45 +0100 Subject: [PATCH] Alerting: Fix state transition from Recovering back to Alerting (#103286) What is this feature? This PR fixes a state transition issue where alerts transitioning from the Recovering state back to the Alerting state incorrectly entered the Pending state first if the rule had a For duration configured. Why do we need this feature? When an alert goes from Alerting to Recovering (when using the Keep firing for) and then back to Alerting, the existing logic would incorrectly put the alert into Pending state while it should be alerting and still sending notifications to the Alertmanager. --- .../ngalert/state/manager_private_test.go | 523 +++++++++++++++++- pkg/services/ngalert/state/manager_test.go | 34 ++ pkg/services/ngalert/state/state.go | 5 +- 3 files changed, 559 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index da8ee1e6c06..e440ac2a1c3 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -157,6 +157,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { t2 := tN(2) t3 := tN(3) t4 := tN(4) + t5 := tN(5) baseRule := &ngmodels.AlertRule{ OrgID: 1, @@ -856,7 +857,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { }, }, { - desc: "t1[1alerting] t2[1:normal] t3[1:alerting] and 'keep_firing_for'>0 at t2,t3", + desc: "t1[1:alerting] t2[1:normal] t3[1:alerting] and 'keep_firing_for'>0 at t2,t3", alertRule: baseRuleWith(ngmodels.RuleMuts.WithKeepFiringForNTimes(1), ngmodels.RuleMuts.WithFor(0)), results: map[time.Time]eval.Results{ t1: { @@ -914,6 +915,68 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { }, }, }, + { + desc: "t1[1:alerting] t2[1:alerting] t3[1:normal] t4[1:alerting] and 'keep_firing_for'>0 and 'for'>0 at t2,t3,t4", + alertRule: baseRuleWith(ngmodels.RuleMuts.WithKeepFiringForNTimes(1), ngmodels.RuleMuts.WithForNTimes(1)), + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t2: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t3: { + newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)), + }, + t4: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + }, + expectedTransitions: map[time.Time][]StateTransition{ + t2: { + { + PreviousState: eval.Pending, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Alerting, + LatestResult: newEvaluation(t2, eval.Alerting), + StartsAt: t2, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t2, + }, + }, + }, + t3: { + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Recovering, + LatestResult: newEvaluation(t3, eval.Normal), + StartsAt: t3, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t2, + }, + }, + }, + t4: { + { + PreviousState: eval.Recovering, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Alerting, + LatestResult: newEvaluation(t4, eval.Alerting), + StartsAt: t4, + EndsAt: t4.Add(ResendDelay * 4), + LastEvaluationTime: t4, + LastSentAt: &t2, + }, + }, + }, + }, + }, { desc: "t1[1:alerting] t2[NoData] t3[NoData] at t2,t3", alertRule: baseRule, @@ -2290,6 +2353,244 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { }, }, }, + { + desc: "t1[{}:normal] t2[NoData] t3[NoData] t4[{}:normal] t5[NoData] and 'keep_firing_for'=2 and 'for'=1 at t3,t4,t5", + ruleMutators: []ngmodels.AlertRuleMutator{ngmodels.RuleMuts.WithKeepFiringForNTimes(2), ngmodels.RuleMuts.WithForNTimes(1)}, + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Normal)), + }, + t2: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t3: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t4: { + newResult(eval.WithState(eval.Normal)), + }, + t5: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + }, + expectedTransitions: map[ngmodels.NoDataState]map[time.Time][]StateTransition{ + ngmodels.NoData: { + t3: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + Annotations: baseRule.Annotations, + State: eval.NoData, + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t2, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t2, + }, + }, + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + Annotations: baseRule.Annotations, + State: eval.Normal, + StateReason: ngmodels.StateReasonMissingSeries, + LatestResult: newEvaluation(t1, eval.Normal), + StartsAt: t1, + EndsAt: t3, + LastEvaluationTime: t3, + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + Annotations: baseRule.Annotations, + State: eval.Normal, + LatestResult: newEvaluation(t4, eval.Normal), + StartsAt: t4, + EndsAt: t4, + LastEvaluationTime: t4, + }, + }, + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + Annotations: baseRule.Annotations, + State: eval.Normal, + LatestResult: newEvaluation(t4, eval.Normal), + StartsAt: t4, + EndsAt: t4, + LastEvaluationTime: t4, + }, + }, + }, + t5: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + Annotations: baseRule.Annotations, + State: eval.NoData, + LatestResult: newEvaluation(t5, eval.NoData), + StartsAt: t2, + EndsAt: t5.Add(ResendDelay * 4), + LastEvaluationTime: t5, + LastSentAt: &t5, + }, + }, + }, + }, + ngmodels.Alerting: { + t3: { + { + PreviousState: eval.Pending, + PreviousStateReason: eval.NoData.String(), + State: &State{ + Labels: labels["system + rule"], + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + State: eval.Alerting, + StateReason: eval.NoData.String(), + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t3, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t3, + }, + }, + }, + t4: { + { + PreviousState: eval.Alerting, + PreviousStateReason: eval.NoData.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Recovering, + LatestResult: newEvaluation(t4, eval.Normal), + StartsAt: t4, + EndsAt: t4.Add(ResendDelay * 4), + LastEvaluationTime: t4, + LastSentAt: &t3, + }, + }, + }, + t5: { + { + PreviousState: eval.Recovering, + State: &State{ + Labels: labels["system + rule"], + State: eval.Alerting, + StateReason: eval.NoData.String(), + LatestResult: newEvaluation(t5, eval.NoData), + StartsAt: t5, + EndsAt: t5.Add(ResendDelay * 4), + LastEvaluationTime: t5, + LastSentAt: &t3, + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + }, + }, + }, + }, + ngmodels.OK: { + t3: { + { + PreviousState: eval.Normal, + PreviousStateReason: eval.NoData.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: eval.NoData.String(), + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t3, + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + PreviousStateReason: eval.NoData.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + LatestResult: newEvaluation(t4, eval.Normal), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t4, + }, + }, + }, + t5: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: eval.NoData.String(), + LatestResult: newEvaluation(t5, eval.NoData), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t5, + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + }, + }, + }, + }, + ngmodels.KeepLast: { + t3: { + { + PreviousState: eval.Normal, + PreviousStateReason: ngmodels.ConcatReasons(eval.NoData.String(), ngmodels.StateReasonKeepLast), + State: &State{ + Labels: labels["system + rule"], + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + State: eval.Normal, + StateReason: ngmodels.ConcatReasons(eval.NoData.String(), ngmodels.StateReasonKeepLast), + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t3, + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + PreviousStateReason: ngmodels.ConcatReasons(eval.NoData.String(), ngmodels.StateReasonKeepLast), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + LatestResult: newEvaluation(t4, eval.Normal), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t4, + }, + }, + }, + t5: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: ngmodels.ConcatReasons(eval.NoData.String(), ngmodels.StateReasonKeepLast), + LatestResult: newEvaluation(t5, eval.NoData), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t5, + Annotations: mergeLabels(baseRule.Annotations, noDataAnnotations), + }, + }, + }, + }, + }, + }, { desc: "t1[NoData] t2[1:normal] t3[1:normal] at t3", results: map[time.Time]eval.Results{ @@ -3532,6 +3833,226 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { }, }, }, + { + desc: "t1[{}:QueryError] t2[{}:QueryError] t3[{}:normal] t4[{}:QueryError] and 'keep_firing_for'=2 and 'for'=1 at t2,t3,t4", + ruleMutators: []ngmodels.AlertRuleMutator{ngmodels.RuleMuts.WithKeepFiringForNTimes(2), ngmodels.RuleMuts.WithForNTimes(1)}, + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithError(datasourceError)), + }, + t2: { + newResult(eval.WithError(datasourceError)), + }, + t3: { + newResult(eval.WithState(eval.Normal)), + }, + t4: { + newResult(eval.WithError(datasourceError)), + }, + }, + expectedTransitions: map[ngmodels.ExecutionErrorState]map[time.Time][]StateTransition{ + ngmodels.ErrorErrState: { + t2: { + { + PreviousState: eval.Error, + State: &State{ + CacheID: labels["system + rule"].Fingerprint(), + Labels: labels["system + rule + datasource-error"], + Error: datasourceError, + State: eval.Error, + LatestResult: newEvaluation(t2, eval.Error), + StartsAt: t1, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t1, + Annotations: mergeLabels(baseRule.Annotations, data.Labels{ + "Error": datasourceError.Error(), + }), + }, + }, + }, + t3: { + { + PreviousState: eval.Error, + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + LatestResult: newEvaluation(t3, eval.Normal), + StartsAt: t3, + EndsAt: t3, + LastEvaluationTime: t3, + LastSentAt: &t1, + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + State: &State{ + CacheID: labels["system + rule"].Fingerprint(), + Labels: labels["system + rule + datasource-error"], + State: eval.Error, + LatestResult: newEvaluation(t4, eval.Error), + Error: datasourceError, + StartsAt: t4, + EndsAt: t4.Add(ResendDelay * 4), + LastEvaluationTime: t4, + LastSentAt: &t4, + Annotations: mergeLabels(baseRule.Annotations, data.Labels{ + "Error": datasourceError.Error(), + }), + }, + }, + }, + }, + ngmodels.AlertingErrState: { + t2: { + { + PreviousState: eval.Pending, + PreviousStateReason: eval.Error.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Alerting, + StateReason: eval.Error.String(), + Error: datasourceError, + LatestResult: newEvaluation(t2, eval.Error), + StartsAt: t2, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t2, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + t3: { + { + PreviousState: eval.Alerting, + PreviousStateReason: eval.Error.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Recovering, + LatestResult: newEvaluation(t3, eval.Normal), + StartsAt: t3, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t2, + }, + }, + }, + t4: { + { + PreviousState: eval.Recovering, + State: &State{ + Labels: labels["system + rule"], + State: eval.Alerting, + StateReason: eval.Error.String(), + Error: datasourceError, + LatestResult: newEvaluation(t4, eval.Error), + StartsAt: t4, + EndsAt: t4.Add(ResendDelay * 4), + LastEvaluationTime: t4, + LastSentAt: &t2, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + }, + ngmodels.OkErrState: { + t2: { + { + PreviousState: eval.Normal, + PreviousStateReason: eval.Error.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: eval.Error.String(), + LatestResult: newEvaluation(t2, eval.Error), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t2, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + t3: { + { + PreviousState: eval.Normal, + PreviousStateReason: eval.Error.String(), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + LatestResult: newEvaluation(t3, eval.Normal), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t3, + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: eval.Error.String(), + LatestResult: newEvaluation(t4, eval.Error), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t4, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + }, + ngmodels.KeepLastErrState: { + t2: { + { + PreviousState: eval.Normal, + PreviousStateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast), + LatestResult: newEvaluation(t2, eval.Error), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t2, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + t3: { + { + PreviousState: eval.Normal, + PreviousStateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast), + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + LatestResult: newEvaluation(t3, eval.Normal), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t3, + }, + }, + }, + t4: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule"], + State: eval.Normal, + StateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast), + LatestResult: newEvaluation(t4, eval.Error), + StartsAt: t1, + EndsAt: t1, + LastEvaluationTime: t4, + Annotations: datasourceErrorAnnotations, + }, + }, + }, + }, + }, + }, { desc: "t1[{}:normal] t2[QueryError] at t2", results: map[time.Time]eval.Results{ diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index eb71be96f8d..29f6cab577c 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -1201,6 +1201,40 @@ func TestProcessEvalResults(t *testing.T) { }, }, }, + { + desc: "normal -> alerting (pending) -> alerting -> normal (recovering) -> alerting when both KeepFiringFor and For are set", + alertRule: baseRuleWith(m.WithKeepFiringForNTimes(2), m.WithForNTimes(1)), + evalResults: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)), + }, + t2: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t3: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + tn(4): { + newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)), + }, + tn(5): { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + }, + expectedAnnotations: 4, + expectedStates: []*state.State{ + { + Labels: labels["system + rule + labels1"], + ResultFingerprint: labels1.Fingerprint(), + State: eval.Alerting, + LatestResult: newEvaluation(tn(5), eval.Alerting), + StartsAt: tn(5), + EndsAt: tn(5).Add(state.ResendDelay * 4), + LastEvaluationTime: tn(5), + LastSentAt: util.Pointer(t3), + }, + }, + }, } for _, tc := range testCases { diff --git a/pkg/services/ngalert/state/state.go b/pkg/services/ngalert/state/state.go index d233756d189..dbc42336665 100644 --- a/pkg/services/ngalert/state/state.go +++ b/pkg/services/ngalert/state/state.go @@ -417,8 +417,9 @@ func resultAlerting(state *State, rule *models.AlertRule, result eval.Result, lo } default: nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt) - if rule.For > 0 { - // If the alert rule has a For duration that should be observed then the state should be set to Pending + if state.State != eval.Recovering && rule.For > 0 { + // If the alert rule has a For duration that should be observed then the state should be set to Pending. + // If the alert is currently in the Recovering state then we skip Pending and set it directly to Alerting. logger.Debug("Changing state", "previous_state", state.State,