Alerting: Add backend support for keep_firing_for (#100750)
What is this feature?

This PR introduces a new alert rule configuration option, keep_firing_for (see the Prometheus documentation). keep_firing_for prevents alerts from resolving immediately after the alert condition returns to normal. Instead, they transition into a "Recovering" state, during which the Alertmanager does not consider them resolved. Once the recovery period ends (or after the next evaluation, if the evaluation interval is longer than keep_firing_for), the alert transitions to "Normal", provided it has not started alerting again:

Before

+----------+     +----------+
| Alerting |---->|  Normal  |
+----------+     +----------+

-----

After

+----------+      +------------+     +----------+
| Alerting |----->| Recovering |---->|  Normal  |
+----------+      +------------+     +----------+

Why do we need this feature?

This feature prevents flapping alerts by adding a recovery period. This helps avoid false resolutions caused by brief alert [...] (the commit message is truncated here in the source).
This commit is contained in:
committed by
GitHub
parent
9491fa1895
commit
695ac91290
@@ -813,9 +813,10 @@ func TestIntegrationAlertRuleEditorSettings(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
alertRule := apimodels.PostableExtendedRuleNode{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: &interval,
|
||||
Labels: map[string]string{"label1": "val1"},
|
||||
Annotations: map[string]string{"annotation1": "val1"},
|
||||
For: &interval,
|
||||
KeepFiringFor: &interval,
|
||||
Labels: map[string]string{"label1": "val1"},
|
||||
Annotations: map[string]string{"annotation1": "val1"},
|
||||
},
|
||||
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
|
||||
Title: "AlwaysFiring",
|
||||
@@ -1088,8 +1089,9 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
|
||||
Rules: []apimodels.PostableExtendedRuleNode{
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: &interval,
|
||||
Labels: map[string]string{},
|
||||
For: &interval,
|
||||
KeepFiringFor: &interval,
|
||||
Labels: map[string]string{},
|
||||
Annotations: map[string]string{
|
||||
"__dashboardUid__": dashboardUID,
|
||||
"__panelId__": "1",
|
||||
@@ -1151,6 +1153,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
|
||||
"rules": [{
|
||||
"expr": "",
|
||||
"for": "10s",
|
||||
"keep_firing_for": "10s",
|
||||
"annotations": {
|
||||
"__dashboardUid__": "%s",
|
||||
"__panelId__": "1"
|
||||
@@ -1197,6 +1200,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
|
||||
}, {
|
||||
"expr": "",
|
||||
"for":"0s",
|
||||
"keep_firing_for": "0s",
|
||||
"grafana_alert": {
|
||||
"title": "AlwaysFiringButSilenced",
|
||||
"condition": "A",
|
||||
@@ -1247,6 +1251,7 @@ func TestIntegrationRulerRulesFilterByDashboard(t *testing.T) {
|
||||
"rules": [{
|
||||
"expr": "",
|
||||
"for": "10s",
|
||||
"keep_firing_for": "10s",
|
||||
"annotations": {
|
||||
"__dashboardUid__": "%s",
|
||||
"__panelId__": "1"
|
||||
@@ -1556,7 +1561,8 @@ func TestIntegrationRuleCreate(t *testing.T) {
|
||||
Rules: []apimodels.PostableExtendedRuleNode{
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: util.Pointer(model.Duration(2 * time.Minute)),
|
||||
For: util.Pointer(model.Duration(2 * time.Minute)),
|
||||
KeepFiringFor: util.Pointer(model.Duration(1 * time.Minute)),
|
||||
Labels: map[string]string{
|
||||
"foo🙂": "bar",
|
||||
"_bar1": "baz🙂",
|
||||
@@ -1589,7 +1595,8 @@ func TestIntegrationRuleCreate(t *testing.T) {
|
||||
Rules: []apimodels.GettableExtendedRuleNode{
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: util.Pointer(model.Duration(2 * time.Minute)),
|
||||
For: util.Pointer(model.Duration(2 * time.Minute)),
|
||||
KeepFiringFor: util.Pointer(model.Duration(1 * time.Minute)),
|
||||
Labels: map[string]string{
|
||||
"foo🙂": "bar",
|
||||
"_bar1": "baz🙂",
|
||||
@@ -1733,6 +1740,27 @@ func TestIntegrationRuleUpdate(t *testing.T) {
|
||||
require.Equal(t, http.StatusAccepted, status)
|
||||
require.Equal(t, expected, *getGroup.Rules[0].ApiRuleNode.For)
|
||||
})
|
||||
|
||||
t.Run("should be able to reset 'keep_firing_for' to 0", func(t *testing.T) {
|
||||
group := generateAlertRuleGroup(1, alertRuleGen())
|
||||
keepFiringFor := model.Duration(10 * time.Second)
|
||||
group.Rules[0].ApiRuleNode.KeepFiringFor = &keepFiringFor
|
||||
|
||||
_, status, body := client.PostRulesGroupWithStatus(t, folderUID, &group, false)
|
||||
require.Equalf(t, http.StatusAccepted, status, "failed to post rule group. Response: %s", body)
|
||||
getGroup, _ := client.GetRulesGroup(t, folderUID, group.Name)
|
||||
require.Equal(t, keepFiringFor, *getGroup.Rules[0].ApiRuleNode.KeepFiringFor)
|
||||
|
||||
group = convertGettableRuleGroupToPostable(getGroup.GettableRuleGroupConfig)
|
||||
newKeepFiringFor := model.Duration(0)
|
||||
group.Rules[0].ApiRuleNode.KeepFiringFor = &newKeepFiringFor
|
||||
_, status, body = client.PostRulesGroupWithStatus(t, folderUID, &group, false)
|
||||
require.Equalf(t, http.StatusAccepted, status, "failed to post rule group. Response: %s", body)
|
||||
|
||||
getGroup, _ = client.GetRulesGroup(t, folderUID, group.Name)
|
||||
require.Equal(t, newKeepFiringFor, *getGroup.Rules[0].ApiRuleNode.KeepFiringFor)
|
||||
})
|
||||
|
||||
t.Run("when data source missing", func(t *testing.T) {
|
||||
var groupName string
|
||||
{
|
||||
@@ -2552,6 +2580,7 @@ func TestIntegrationQuota(t *testing.T) {
|
||||
{
|
||||
"expr":"",
|
||||
"for": "2m",
|
||||
"keep_firing_for": "0s",
|
||||
"grafana_alert":{
|
||||
"title":"Updated alert rule",
|
||||
"condition":"A",
|
||||
@@ -2660,6 +2689,7 @@ func TestIntegrationDeleteFolderWithRules(t *testing.T) {
|
||||
{
|
||||
"expr": "",
|
||||
"for": "2m",
|
||||
"keep_firing_for": "0s",
|
||||
"labels": {
|
||||
"label1": "val1"
|
||||
},
|
||||
@@ -3146,6 +3176,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
},
|
||||
"expr":"",
|
||||
"for": "1m",
|
||||
"keep_firing_for": "0s",
|
||||
"labels": {
|
||||
"label1": "val1"
|
||||
},
|
||||
@@ -3194,6 +3225,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
{
|
||||
"expr":"",
|
||||
"for": "0s",
|
||||
"keep_firing_for": "0s",
|
||||
"grafana_alert":{
|
||||
"title":"AlwaysFiringButSilenced",
|
||||
"condition":"A",
|
||||
@@ -3250,6 +3282,9 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
interval, err := model.ParseDuration("30s")
|
||||
require.NoError(t, err)
|
||||
|
||||
keepFiringFor, err := model.ParseDuration("10s")
|
||||
require.NoError(t, err)
|
||||
|
||||
rules := apimodels.PostableRuleGroupConfig{
|
||||
Name: "arulegroup",
|
||||
Rules: []apimodels.PostableExtendedRuleNode{
|
||||
@@ -3306,7 +3341,8 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
},
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: &interval,
|
||||
For: &interval,
|
||||
KeepFiringFor: &keepFiringFor,
|
||||
Labels: map[string]string{
|
||||
"label1": "val42",
|
||||
"foo": "bar",
|
||||
@@ -3537,6 +3573,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
},
|
||||
"expr":"",
|
||||
"for": "1m",
|
||||
"keep_firing_for": "0s",
|
||||
"labels": {
|
||||
"label1": "val1"
|
||||
},
|
||||
@@ -3585,6 +3622,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
{
|
||||
"expr":"",
|
||||
"for": "0s",
|
||||
"keep_firing_for": "0s",
|
||||
"grafana_alert":{
|
||||
"title":"AlwaysFiringButSilenced",
|
||||
"condition":"A",
|
||||
@@ -3639,12 +3677,16 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
forValue, err := model.ParseDuration("30s")
|
||||
require.NoError(t, err)
|
||||
|
||||
keepFiringForValue, err := model.ParseDuration("5s")
|
||||
require.NoError(t, err)
|
||||
|
||||
rules := apimodels.PostableRuleGroupConfig{
|
||||
Name: "arulegroup",
|
||||
Rules: []apimodels.PostableExtendedRuleNode{
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: &forValue,
|
||||
For: &forValue,
|
||||
KeepFiringFor: &keepFiringForValue,
|
||||
Labels: map[string]string{
|
||||
// delete foo label
|
||||
"label1": "val1", // update label value
|
||||
@@ -3719,6 +3761,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
},
|
||||
"expr":"",
|
||||
"for": "30s",
|
||||
"keep_firing_for": "5s",
|
||||
"labels": {
|
||||
"label1": "val1",
|
||||
"label2": "val2"
|
||||
@@ -3776,12 +3819,16 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
forValue, err := model.ParseDuration("30s")
|
||||
require.NoError(t, err)
|
||||
|
||||
keepFiringForValue, err := model.ParseDuration("15s")
|
||||
require.NoError(t, err)
|
||||
|
||||
rules := apimodels.PostableRuleGroupConfig{
|
||||
Name: "arulegroup",
|
||||
Rules: []apimodels.PostableExtendedRuleNode{
|
||||
{
|
||||
ApiRuleNode: &apimodels.ApiRuleNode{
|
||||
For: &forValue,
|
||||
For: &forValue,
|
||||
KeepFiringFor: &keepFiringForValue,
|
||||
},
|
||||
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
|
||||
UID: ruleUID, // Including the UID in the payload makes the endpoint update the existing rule.
|
||||
@@ -3841,6 +3888,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
{
|
||||
"expr":"",
|
||||
"for": "30s",
|
||||
"keep_firing_for": "15s",
|
||||
"grafana_alert":{
|
||||
"title":"AlwaysNormal",
|
||||
"condition":"A",
|
||||
@@ -3938,6 +3986,7 @@ func TestIntegrationAlertRuleCRUD(t *testing.T) {
|
||||
{
|
||||
"expr":"",
|
||||
"for": "30s",
|
||||
"keep_firing_for": "15s",
|
||||
"grafana_alert":{
|
||||
"title":"AlwaysNormal",
|
||||
"condition":"A",
|
||||
|
||||
Reference in New Issue
Block a user