Files
grafana/pkg/services/ngalert/notifier/historian.go
T
Vadim Stepanov bccc980b90 Alerting: Notification history (#107644)
* Add unified_alerting.notification_history to ini files

* Parse notification history settings

* Move Loki client to a separate package

* Loki client: add params for metrics and traces

* add NotificationHistorian

* rm writeDuration

* remove RangeQuery stuff

* wip

* wip

* wip

* wip

* pass notification historian in tests

* unify loki settings

* unify loki settings

* add test

* update grafana/alerting

* make update-workspace

* add feature toggle

* fix configureNotificationHistorian

* Revert "add feature toggle"

This reverts commit de7af8f7

* add feature toggle

* more tests

* RuleUID

* fix metrics test

* met.Info.Set(0)
2025-07-17 14:26:26 +01:00

191 lines
6.4 KiB
Go

package notifier
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/client"
"github.com/grafana/grafana/pkg/services/ngalert/lokiclient"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/prometheus/alertmanager/notify"
"github.com/prometheus/alertmanager/types"
prometheusModel "github.com/prometheus/common/model"
"go.opentelemetry.io/otel/trace"
)
// Constants used by the notification historian.
const (
	// LokiClientSpanName is the span name handed to the Loki client when the
	// historian constructs it.
	LokiClientSpanName = "ngalert.notification-historian.client"

	// NotificationHistoryWriteTimeout is the deadline applied to each
	// background write started by Record.
	NotificationHistoryWriteTimeout = time.Minute

	// NotificationHistoryKey is the stream label key that prepareStream adds
	// to the streams it builds.
	NotificationHistoryKey = "from"

	// NotificationHistoryLabelValue is the value prepareStream assigns to the
	// NotificationHistoryKey label.
	NotificationHistoryLabelValue = "notify-history"
)
// NotificationHistoryLokiEntry is the JSON document written to Loki for one
// notification attempt. prepareStream fills it in and marshals it into the
// value of the log sample.
type NotificationHistoryLokiEntry struct {
	// SchemaVersion identifies the layout of this document; prepareStream sets it to 1.
	SchemaVersion int `json:"schemaVersion"`
	// Receiver is the receiver name read from the notification context.
	Receiver string `json:"receiver"`
	// Status is the combined status of all notified alerts, evaluated at the
	// notification time read from the context.
	Status string `json:"status"`
	// GroupLabels are the group labels from the context after prepareLabels
	// has filtered them.
	GroupLabels map[string]string `json:"groupLabels"`
	// Alerts holds one element per alert passed to Record, in the same order.
	Alerts []NotificationHistoryLokiEntryAlert `json:"alerts"`
	// Retry is the retry flag passed to Record.
	// NOTE(review): presumably true when this attempt is a retry — confirm with the caller.
	Retry bool `json:"retry"`
	// Error is the message of the notification error; it is empty (and omitted
	// from the JSON) when the notification error was nil.
	Error string `json:"error,omitempty"`
	// Duration is the duration passed to Record, expressed in milliseconds.
	Duration int64 `json:"duration"`
}
// NotificationHistoryLokiEntryAlert is the JSON representation of a single
// alert inside a NotificationHistoryLokiEntry.
type NotificationHistoryLokiEntryAlert struct {
	// Status is the alert's status evaluated at the notification time.
	Status string `json:"status"`
	// Labels are the alert's labels after prepareLabels has filtered them.
	Labels map[string]string `json:"labels"`
	// Annotations are the alert's annotations after prepareLabels has filtered them.
	Annotations map[string]string `json:"annotations"`
	// StartsAt is copied from the alert's StartsAt field.
	StartsAt time.Time `json:"startsAt"`
	// EndsAt is copied from the alert's EndsAt field.
	EndsAt time.Time `json:"endsAt"`
	// RuleUID is the value of the alert's alertingModels.RuleUIDLabel label,
	// read from the unfiltered label set; empty if the label is absent.
	RuleUID string `json:"ruleUID"`
}
// remoteLokiClient is the subset of Loki client behavior that
// NotificationHistorian relies on.
type remoteLokiClient interface {
	// Ping is called by TestConnection to check the connection.
	Ping(context.Context) error
	// Push is called by recordStream to send the given streams.
	Push(context.Context, []lokiclient.Stream) error
}
// NotificationHistorian records notification attempts as log entries pushed
// through a Loki client.
type NotificationHistorian struct {
	// client is used to ping the remote and to push streams.
	client remoteLokiClient
	// externalLabels are copied into the labels of every stream built by prepareStream.
	externalLabels map[string]string
	// metrics provides the WritesTotal and WritesFailed counters updated by Record.
	metrics *metrics.NotificationHistorian
	// log is the base logger; per-call loggers are derived from it with FromContext.
	log log.Logger
}
// NewNotificationHistorian builds a NotificationHistorian backed by a new Loki client.
//
// The client is created from cfg and req and receives the BytesWritten and
// WriteDuration members of metrics, the logger, the tracer and
// LokiClientSpanName. cfg.ExternalLabels is retained so prepareStream can add
// those labels to every stream.
func NewNotificationHistorian(logger log.Logger, cfg lokiclient.LokiConfig, req client.Requester, metrics *metrics.NotificationHistorian, tracer tracing.Tracer) *NotificationHistorian {
	lokiClient := lokiclient.NewLokiClient(
		cfg,
		req,
		metrics.BytesWritten,
		metrics.WriteDuration,
		logger,
		tracer,
		LokiClientSpanName,
	)

	historian := new(NotificationHistorian)
	historian.client = lokiClient
	historian.externalLabels = cfg.ExternalLabels
	historian.metrics = metrics
	historian.log = logger
	return historian
}
// TestConnection checks the connection by calling the client's Ping with ctx
// and returns whatever error Ping reports.
func (h *NotificationHistorian) TestConnection(ctx context.Context) error {
	return h.client.Ping(ctx)
}
// Record stores one notification attempt in Loki without blocking the caller.
//
// The stream is built synchronously from ctx and the arguments. If that fails,
// the error is logged, sent on the returned channel, and the channel is closed
// before returning. Otherwise the push runs in a background goroutine: the
// returned channel receives an error only if the push fails, and is closed
// once the goroutine finishes.
func (h *NotificationHistorian) Record(ctx context.Context, alerts []*types.Alert, retry bool, notificationErr error, duration time.Duration) <-chan error {
	stream, prepErr := h.prepareStream(ctx, alerts, retry, notificationErr, duration)
	callerLogger := h.log.FromContext(ctx)

	// Capacity 1: at most one error is ever sent, so the sender never has to
	// wait for a reader.
	result := make(chan error, 1)

	if prepErr != nil {
		callerLogger.Error("Failed to convert notification history to stream", "error", prepErr)
		result <- fmt.Errorf("failed to convert notification history to stream: %w", prepErr)
		close(result)
		return result
	}

	// The write is a detached background job with its own context, so that a
	// Grafana shutdown does not interrupt it immediately and pending writes
	// still get a chance to flush. Starting from context.Background() also
	// keeps deadlines and other request-scoped values (such as transactions)
	// from leaking into the write. Only the tracing span is carried over.
	bgCtx, cancel := context.WithTimeout(context.Background(), NotificationHistoryWriteTimeout)
	bgCtx = trace.ContextWithSpan(bgCtx, trace.SpanFromContext(ctx))

	go func() {
		defer cancel()
		defer close(result)

		bgLogger := h.log.FromContext(bgCtx)
		bgLogger.Debug("Saving notification history")
		h.metrics.WritesTotal.Inc()

		pushErr := h.recordStream(bgCtx, stream, bgLogger)
		if pushErr == nil {
			return
		}

		bgLogger.Error("Failed to save notification history", "error", pushErr)
		h.metrics.WritesFailed.Inc()
		result <- fmt.Errorf("failed to save notification history: %w", pushErr)
	}()

	return result
}
// prepareStream converts one notification attempt into a Loki stream holding a
// single sample.
//
// The receiver name, group labels and notification time are read from ctx with
// the Alertmanager notify helpers; an error is returned if any of them is
// missing. Every alert is serialized with its status at the notification time,
// its filtered labels and annotations, its start/end times and its rule UID.
// The JSON-encoded entry becomes the sample value and the notification time
// becomes the sample timestamp.
//
// The stream labels are the historian's external labels plus the reserved
// NotificationHistoryKey label. The reserved label is written last so that an
// external label with the same key cannot replace it; otherwise the entry
// would no longer carry the NotificationHistoryLabelValue marker.
//
// Returns the stream, or a zero Stream and an error when context data is
// missing or JSON encoding fails.
func (h *NotificationHistorian) prepareStream(ctx context.Context, alerts []*types.Alert, retry bool, notificationErr error, duration time.Duration) (lokiclient.Stream, error) {
	receiverName, ok := notify.ReceiverName(ctx)
	if !ok {
		return lokiclient.Stream{}, fmt.Errorf("receiver name not found in context")
	}
	groupLabels, ok := notify.GroupLabels(ctx)
	if !ok {
		return lokiclient.Stream{}, fmt.Errorf("group labels not found in context")
	}
	now, ok := notify.Now(ctx)
	if !ok {
		return lokiclient.Stream{}, fmt.Errorf("now not found in context")
	}

	entryAlerts := make([]NotificationHistoryLokiEntryAlert, len(alerts))
	for i, alert := range alerts {
		entryAlerts[i] = NotificationHistoryLokiEntryAlert{
			Labels:      prepareLabels(alert.Labels),
			Annotations: prepareLabels(alert.Annotations),
			Status:      string(alert.StatusAt(now)),
			StartsAt:    alert.StartsAt,
			EndsAt:      alert.EndsAt,
			// Read from the unfiltered label set so the rule UID is kept even
			// when prepareLabels drops that key from Labels.
			RuleUID: string(alert.Labels[alertingModels.RuleUIDLabel]),
		}
	}

	notificationErrStr := ""
	if notificationErr != nil {
		notificationErrStr = notificationErr.Error()
	}

	entry := NotificationHistoryLokiEntry{
		SchemaVersion: 1,
		Receiver:      receiverName,
		Status:        string(types.Alerts(alerts...).StatusAt(now)),
		GroupLabels:   prepareLabels(groupLabels),
		Alerts:        entryAlerts,
		Retry:         retry,
		Error:         notificationErrStr,
		Duration:      duration.Milliseconds(),
	}

	entryJSON, err := json.Marshal(entry)
	if err != nil {
		return lokiclient.Stream{}, err
	}

	streamLabels := make(map[string]string, len(h.externalLabels)+1)
	for k, v := range h.externalLabels {
		streamLabels[k] = v
	}
	// The reserved label goes in last so it takes precedence over an external
	// label that happens to use the same key.
	streamLabels[NotificationHistoryKey] = NotificationHistoryLabelValue

	return lokiclient.Stream{
		Stream: streamLabels,
		Values: []lokiclient.Sample{
			{
				T: now,
				V: string(entryJSON),
			},
		},
	}, nil
}
// recordStream pushes stream through the client as a one-element batch.
// A push error is returned unchanged; on success a debug message is logged
// with logger and nil is returned.
func (h *NotificationHistorian) recordStream(ctx context.Context, stream lokiclient.Stream, logger log.Logger) error {
	batch := []lokiclient.Stream{stream}

	pushErr := h.client.Push(ctx, batch)
	if pushErr != nil {
		return pushErr
	}

	logger.Debug("Done saving notification history")
	return nil
}
// prepareLabels copies labels into a plain string map, leaving out private
// labels: any entry whose name starts with "__" or ends with "__".
// The returned map is always non-nil.
func prepareLabels(labels prometheusModel.LabelSet) map[string]string {
	const privateMarker = "__"

	public := make(map[string]string)
	for labelName, labelValue := range labels {
		name := string(labelName)
		if strings.HasPrefix(name, privateMarker) || strings.HasSuffix(name, privateMarker) {
			// Private label: skip it.
			continue
		}
		public[name] = string(labelValue)
	}
	return public
}