Alerting: Expose info about notification delivery errors in a new /receivers endpoint (#55429) (#56899)

* (WIP) switch to fork AM, first implementation of the API, generate spec

* get receivers avoiding race conditions

* use latest version of our forked AM, tests

* make linter happy, delete TODO comment

* update number of expected paths to += 2

* delete unused endpoint code, code review comments, tests

* Update pkg/services/ngalert/notifier/alertmanager.go

Co-authored-by: Matthew Jacobson <matthew.jacobson@grafana.com>

* remove call to fmt.Println

* clear naming for fields

* shorter variable names in GetReceivers

Co-authored-by: Matthew Jacobson <matthew.jacobson@grafana.com>
(cherry picked from commit 09f8e026a1)
This commit is contained in:
Santiago
2022-10-14 12:22:33 -03:00
committed by GitHub
parent 96ae3b7e45
commit ba9293b09d
15 changed files with 659 additions and 439 deletions
@@ -728,7 +728,7 @@ func TestNotificationChannels(t *testing.T) {
channels.GetBoundary = func() string { return "abcd" }
env.NotificationService.EmailHandlerSync = mockEmail.sendEmailCommandHandlerSync
// As we are using a NotificationService mock here, but he test expects real NotificationService -
// As we are using a NotificationService mock here, but the test expects real NotificationService -
// we try to issue a real POST request here
env.NotificationService.WebhookHandler = func(_ context.Context, cmd *models.SendWebhookSync) error {
if res, err := http.Post(cmd.Url, "", strings.NewReader(cmd.Body)); err == nil {
@@ -770,11 +770,31 @@ func TestNotificationChannels(t *testing.T) {
re := regexp.MustCompile(`"uid":"([\w|-]*)"`)
e := getExpAlertmanagerConfigFromAPI(mockChannel.server.Addr)
require.JSONEq(t, e, string(re.ReplaceAll([]byte(b), []byte(`"uid":""`))))
// Check the receivers API. No errors nor attempts to notify should be registered.
receiversURL := fmt.Sprintf("http://grafana:password@%s/api/alertmanager/grafana/config/api/v1/receivers", grafanaListedAddr)
resp = getRequest(t, receiversURL, http.StatusOK) // nolint
b = getBody(t, resp.Body)
var receivers apimodels.Receivers
err := json.Unmarshal([]byte(b), &receivers)
require.NoError(t, err)
for _, rcv := range receivers {
require.NotNil(t, rcv.Name)
require.NotNil(t, rcv.Active)
require.NotEmpty(t, rcv.Integrations)
for _, integration := range rcv.Integrations {
require.NotNil(t, integration.Name)
require.NotNil(t, integration.SendResolved)
require.Equal(t, "", integration.LastNotifyAttemptError)
require.Zero(t, integration.LastNotifyAttempt)
require.Equal(t, "0s", integration.LastNotifyAttemptDuration)
}
}
}
{
// Create rules that will fire as quickly as possible
originalFunction := store.GenerateNewAlertRuleUID
t.Cleanup(func() {
store.GenerateNewAlertRuleUID = originalFunction
@@ -791,6 +811,7 @@ func TestNotificationChannels(t *testing.T) {
// Eventually, we'll get all the desired alerts.
// nolint:gosec
require.Eventually(t, func() bool {
// TODO: not waiting for the failed notifications, flaky test?
return mockChannel.totalNotifications() >= len(nonEmailAlertNames) && len(mockEmail.emails) >= 1
}, 30*time.Second, 1*time.Second)
@@ -798,6 +819,60 @@ func TestNotificationChannels(t *testing.T) {
require.Equal(t, expEmailNotifications, mockEmail.emails)
require.NoError(t, mockChannel.Close())
// Check the receivers API. Errors and inactive receivers are expected, attempts to deliver notifications should be registered.
receiversURL := fmt.Sprintf("http://grafana:password@%s/api/alertmanager/grafana/config/api/v1/receivers", grafanaListedAddr)
resp := getRequest(t, receiversURL, http.StatusOK) // nolint
b := getBody(t, resp.Body)
var receivers apimodels.Receivers
err := json.Unmarshal([]byte(b), &receivers)
require.NoError(t, err)
for _, rcv := range receivers {
var expActive bool
if _, ok := expInactiveReceivers[*rcv.Name]; !ok {
expActive = true
}
var expErr bool
if _, ok := expNotificationErrors[*rcv.Name]; ok {
expErr = true
}
require.NotNil(t, rcv.Name)
require.NotNil(t, rcv.Active)
require.NotEmpty(t, rcv.Integrations)
if expActive {
require.True(t, *rcv.Active)
}
// We don't have test alerts for the default notifier, continue iterating.
if *rcv.Name == "grafana-default-email" {
continue
}
for _, integration := range rcv.Integrations {
require.NotNil(t, integration.Name)
require.NotNil(t, integration.SendResolved)
// If the receiver is not active, no attempts to send notifications should be registered.
if expActive {
require.NotZero(t, integration.LastNotifyAttempt)
require.NotEqual(t, "0s", integration.LastNotifyAttemptDuration)
} else {
require.Zero(t, integration.LastNotifyAttempt)
require.Equal(t, "0s", integration.LastNotifyAttemptDuration)
}
// Check whether we're expecting an error on this integration.
if expErr {
for _, integration := range rcv.Integrations {
require.Equal(t, expNotificationErrors[*rcv.Name], integration.LastNotifyAttemptError)
}
} else {
require.Equal(t, "", integration.LastNotifyAttemptError)
}
}
}
{
// Delete the configuration; so it returns the default configuration.
u := fmt.Sprintf("http://grafana:password@%s/api/alertmanager/grafana/config/api/v1/alerts", grafanaListedAddr)
@@ -859,6 +934,10 @@ var emailAlertNames = []string{
"EmailAlert",
}
var failedAlertNames = []string{
"SlackFailedAlert",
}
func getRulesConfig(t *testing.T) string {
t.Helper()
interval, err := model.ParseDuration("10s")
@@ -869,7 +948,10 @@ func getRulesConfig(t *testing.T) string {
}
// Create rules that will fire as quickly as possible for all the routes.
for _, alertName := range append(nonEmailAlertNames, emailAlertNames...) {
rulesToCreate := append(nonEmailAlertNames, emailAlertNames...)
rulesToCreate = append(rulesToCreate, failedAlertNames...)
for _, alertName := range rulesToCreate {
rules.Rules = append(rules.Rules, apimodels.PostableExtendedRuleNode{
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
Title: alertName,
@@ -1124,6 +1206,26 @@ const alertmanagerConfig = `
"alertname=\"SlackAlert2\""
]
},
{
"receiver": "slack_failed_recv",
"group_wait": "0s",
"group_by": [
"alertname"
],
"matchers": [
"alertname=\"SlackFailedAlert\""
]
},
{
"receiver": "slack_inactive_recv",
"group_wait": "0s",
"group_by": [
"alertname"
],
"matchers": [
"alertname=\"Inactive\""
]
},
{
"receiver": "pagerduty_recv",
"group_wait": "0s",
@@ -1273,7 +1375,7 @@ const alertmanagerConfig = `
"matchers": [
"alertname=\"TelegramAlert\""
]
}
}
]
},
"receivers": [
@@ -1483,7 +1585,7 @@ const alertmanagerConfig = `
}
]
},
{
{
"name": "slack_recv1",
"grafana_managed_receiver_configs": [
{
@@ -1506,8 +1608,8 @@ const alertmanagerConfig = `
}
}
]
},
{
},
{
"name": "slack_recv2",
"grafana_managed_receiver_configs": [
{
@@ -1523,6 +1625,39 @@ const alertmanagerConfig = `
}
}
]
},
{
"name": "slack_failed_recv",
"grafana_managed_receiver_configs": [
{
"name": "slack_failed_test",
"type": "slack",
"settings": {
"recipient": "#test-channel",
"username": "test",
"text": "Integration Test"
},
"secureSettings": {
"url": "htt://127.0.0.1:8080/slack_failed_recv/slack_failed_test"
}
}
]
},
{
"name": "slack_inactive_recv",
"grafana_managed_receiver_configs": [
{
"name": "inactive",
"type": "slack",
"settings": {
"recipient": "#inactive-channel",
"username": "Integration Test"
},
"secureSettings": {
"token": "myfullysecrettoken"
}
}
]
},
{
"name": "pagerduty_recv",
@@ -1589,7 +1724,26 @@ var expAlertmanagerConfigFromAPI = `
"alertname=\"SlackAlert2\""
]
},
{
{
"receiver": "slack_failed_recv",
"group_wait": "0s",
"group_by": [
"alertname"
],
"matchers": [
"alertname=\"SlackFailedAlert\""
]
},
{
"receiver": "slack_inactive_recv",
"group_wait": "0s",
"group_by": [
"alertname"
],
"matchers": [
"alertname=\"Inactive\""
]
}, {
"receiver": "pagerduty_recv",
"group_wait": "0s",
"group_by": [
@@ -1738,7 +1892,7 @@ var expAlertmanagerConfigFromAPI = `
"matchers": [
"alertname=\"TelegramAlert\""
]
}
}
]
},
"templates": null,
@@ -1986,7 +2140,7 @@ var expAlertmanagerConfigFromAPI = `
}
]
},
{
{
"name": "slack_recv1",
"grafana_managed_receiver_configs": [
{
@@ -2031,6 +2185,43 @@ var expAlertmanagerConfigFromAPI = `
}
]
},
{
"name": "slack_failed_recv",
"grafana_managed_receiver_configs": [
{
"uid": "",
"name": "slack_failed_test",
"type": "slack",
"disableResolveMessage": false,
"settings": {
"recipient": "#test-channel",
"username": "test",
"text": "Integration Test"
},
"secureFields": {
"url": true
}
}
]
},
{
"name": "slack_inactive_recv",
"grafana_managed_receiver_configs": [
{
"uid": "",
"name": "inactive",
"type": "slack",
"disableResolveMessage": false,
"settings": {
"recipient": "#inactive-channel",
"username": "Integration Test"
},
"secureFields": {
"token": true
}
}
]
},
{
"name": "pagerduty_recv",
"grafana_managed_receiver_configs": [
@@ -2435,3 +2626,13 @@ var expNonEmailNotifications = map[string][]string{
]`,
},
}
// expNotificationErrors maps a receiver name with its expected error string.
var expNotificationErrors = map[string]string{
"slack_failed_recv": `Post "htt://127.0.0.1:8080/slack_failed_recv/slack_failed_test": unsupported protocol scheme "htt"`,
}
// expNotificationErrors maps a receiver name with its expected error string.
var expInactiveReceivers = map[string]struct{}{
"slack_inactive_recv": {},
}