Alerting: Add backend support for keep_firing_for (#100750)

What is this feature?

This PR introduces a new alert rule configuration option, keep_firing_for (Prometheus documentation).

keep_firing_for prevents alerts from resolving immediately after the alert condition returns to normal. Instead, they transition into a "Recovering" state, during which the Alertmanager does not consider them resolved. Once the recovery period ends (or at the next evaluation, if the evaluation interval is longer than keep_firing_for), the alert transitions to "Normal", provided it has not started alerting again:

Before                                          

+----------+     +----------+                    
| Alerting |---->|  Normal  |                    
+----------+     +----------+                    

-----
After

+----------+      +------------+     +----------+
| Alerting |----->| Recovering |---->|  Normal  |
+----------+      +------------+     +----------+                                                 

Why do we need this feature?

This feature prevents flapping alerts by adding a recovery period. This helps avoid false resolutions caused by brief alert […]
This commit is contained in:
Alexander Akhmetov
2025-03-18 11:24:48 +01:00
committed by GitHub
parent 9491fa1895
commit 695ac91290
31 changed files with 1280 additions and 53 deletions
+59 -10
View File
@@ -813,9 +813,10 @@ func TestIntegrationAlertRuleEditorSettings(t *testing.T) {
require.NoError(t, err)
alertRule := apimodels.PostableExtendedRuleNode{
ApiRuleNode: &apimodels.ApiRuleNode{
For: &interval,
Labels: map[string]string{"label1": "val1"},
Annotations: map[string]string{"annotation1": "val1"},
For: &interval,
KeepFiringFor: &interval,
Labels: map[string]string{"label1": "val1"},
Annotations: map[string]string{"annotation1": "val1"},
},
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
Title: "AlwaysFiring",
@@ -1088,8 +1089,9 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
Rules: []apimodels.PostableExtendedRuleNode{
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: &interval,
Labels: map[string]string{},
For: &interval,
KeepFiringFor: &interval,
Labels: map[string]string{},
Annotations: map[string]string{
"__dashboardUid__": dashboardUID,
"__panelId__": "1",
@@ -1151,6 +1153,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
"rules": [{
"expr": "",
"for": "10s",
"keep_firing_for": "10s",
"annotations": {
"__dashboardUid__": "%s",
"__panelId__": "1"
@@ -1197,6 +1200,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
}, {
"expr": "",
"for":"0s",
"keep_firing_for": "0s",
"grafana_alert": {
"title": "AlwaysFiringButSilenced",
"condition": "A",
@@ -1247,6 +1251,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
"rules": [{
"expr": "",
"for": "10s",
"keep_firing_for": "10s",
"annotations": {
"__dashboardUid__": "%s",
"__panelId__": "1"
@@ -1556,7 +1561,8 @@ func TestIntegrationRuleCreate(t *testing.T) {
Rules: []apimodels.PostableExtendedRuleNode{
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: util.Pointer(model.Duration(2 * time.Minute)),
For: util.Pointer(model.Duration(2 * time.Minute)),
KeepFiringFor: util.Pointer(model.Duration(1 * time.Minute)),
Labels: map[string]string{
"foo🙂": "bar",
"_bar1": "baz🙂",
@@ -1589,7 +1595,8 @@ func TestIntegrationRuleCreate(t *testing.T) {
Rules: []apimodels.GettableExtendedRuleNode{
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: util.Pointer(model.Duration(2 * time.Minute)),
For: util.Pointer(model.Duration(2 * time.Minute)),
KeepFiringFor: util.Pointer(model.Duration(1 * time.Minute)),
Labels: map[string]string{
"foo🙂": "bar",
"_bar1": "baz🙂",
@@ -1733,6 +1740,27 @@ func TestIntegrationRuleUpdate(t *testing.T) {
require.Equal(t, http.StatusAccepted, status)
require.Equal(t, expected, *getGroup.Rules[0].ApiRuleNode.For)
})
// Subtest: a rule saved with a non-zero keep_firing_for can later be updated
// so that keep_firing_for is an explicit zero, and the zero value is what the
// API returns afterwards.
t.Run("should be able to reset 'keep_firing_for' to 0", func(t *testing.T) {
// Step 1: create a group (presumably containing a single generated rule —
// confirm against generateAlertRuleGroup) with keep_firing_for = 10s.
group := generateAlertRuleGroup(1, alertRuleGen())
keepFiringFor := model.Duration(10 * time.Second)
group.Rules[0].ApiRuleNode.KeepFiringFor = &keepFiringFor
_, status, body := client.PostRulesGroupWithStatus(t, folderUID, &group, false)
require.Equalf(t, http.StatusAccepted, status, "failed to post rule group. Response: %s", body)
// Read the group back and check the stored value. The second return value
// of GetRulesGroup is discarded here. The pointer is dereferenced directly,
// so a nil KeepFiringFor in the response would panic rather than fail the
// assertion.
getGroup, _ := client.GetRulesGroup(t, folderUID, group.Name)
require.Equal(t, keepFiringFor, *getGroup.Rules[0].ApiRuleNode.KeepFiringFor)
// Step 2: convert the fetched group back into its postable form and re-post
// it with keep_firing_for set to a pointer to zero (not a nil pointer).
group = convertGettableRuleGroupToPostable(getGroup.GettableRuleGroupConfig)
newKeepFiringFor := model.Duration(0)
group.Rules[0].ApiRuleNode.KeepFiringFor = &newKeepFiringFor
_, status, body = client.PostRulesGroupWithStatus(t, folderUID, &group, false)
require.Equalf(t, http.StatusAccepted, status, "failed to post rule group. Response: %s", body)
// The API must now report zero for keep_firing_for.
getGroup, _ = client.GetRulesGroup(t, folderUID, group.Name)
require.Equal(t, newKeepFiringFor, *getGroup.Rules[0].ApiRuleNode.KeepFiringFor)
})
t.Run("when data source missing", func(t *testing.T) {
var groupName string
{
@@ -2552,6 +2580,7 @@ func TestIntegrationQuota(t *testing.T) {
{
"expr":"",
"for": "2m",
"keep_firing_for": "0s",
"grafana_alert":{
"title":"Updated alert rule",
"condition":"A",
@@ -2660,6 +2689,7 @@ func TestIntegrationDeleteFolderWithRules(t *testing.T) {
{
"expr": "",
"for": "2m",
"keep_firing_for": "0s",
"labels": {
"label1": "val1"
},
@@ -3146,6 +3176,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
},
"expr":"",
"for": "1m",
"keep_firing_for": "0s",
"labels": {
"label1": "val1"
},
@@ -3194,6 +3225,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
{
"expr":"",
"for": "0s",
"keep_firing_for": "0s",
"grafana_alert":{
"title":"AlwaysFiringButSilenced",
"condition":"A",
@@ -3250,6 +3282,9 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
interval, err := model.ParseDuration("30s")
require.NoError(t, err)
keepFiringFor, err := model.ParseDuration("10s")
require.NoError(t, err)
rules := apimodels.PostableRuleGroupConfig{
Name: "arulegroup",
Rules: []apimodels.PostableExtendedRuleNode{
@@ -3306,7 +3341,8 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
},
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: &interval,
For: &interval,
KeepFiringFor: &keepFiringFor,
Labels: map[string]string{
"label1": "val42",
"foo": "bar",
@@ -3537,6 +3573,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
},
"expr":"",
"for": "1m",
"keep_firing_for": "0s",
"labels": {
"label1": "val1"
},
@@ -3585,6 +3622,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
{
"expr":"",
"for": "0s",
"keep_firing_for": "0s",
"grafana_alert":{
"title":"AlwaysFiringButSilenced",
"condition":"A",
@@ -3639,12 +3677,16 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
forValue, err := model.ParseDuration("30s")
require.NoError(t, err)
keepFiringForValue, err := model.ParseDuration("5s")
require.NoError(t, err)
rules := apimodels.PostableRuleGroupConfig{
Name: "arulegroup",
Rules: []apimodels.PostableExtendedRuleNode{
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: &forValue,
For: &forValue,
KeepFiringFor: &keepFiringForValue,
Labels: map[string]string{
// delete foo label
"label1": "val1", // update label value
@@ -3719,6 +3761,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
},
"expr":"",
"for": "30s",
"keep_firing_for": "5s",
"labels": {
"label1": "val1",
"label2": "val2"
@@ -3776,12 +3819,16 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
forValue, err := model.ParseDuration("30s")
require.NoError(t, err)
keepFiringForValue, err := model.ParseDuration("15s")
require.NoError(t, err)
rules := apimodels.PostableRuleGroupConfig{
Name: "arulegroup",
Rules: []apimodels.PostableExtendedRuleNode{
{
ApiRuleNode: &apimodels.ApiRuleNode{
For: &forValue,
For: &forValue,
KeepFiringFor: &keepFiringForValue,
},
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
UID: ruleUID, // Including the UID in the payload makes the endpoint update the existing rule.
@@ -3841,6 +3888,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
{
"expr":"",
"for": "30s",
"keep_firing_for": "15s",
"grafana_alert":{
"title":"AlwaysNormal",
"condition":"A",
@@ -3938,6 +3986,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
{
"expr":"",
"for": "30s",
"keep_firing_for": "15s",
"grafana_alert":{
"title":"AlwaysNormal",
"condition":"A",