Files
grafana/pkg/services/ngalert/backtesting/engine.go
Yuri Tseretyan fa1e6cce5e Alerting: Rule backtesting with experimental UI (#115525)
* add function to convert StateTransition to LokiEntry
* add QueryResultBuilder
* update backtesting to produce result similar to historian
* make shouldRecord public
* filter out noop transitions
* add experimental front-end
* add new fields
* move conversion of api model to AlertRule to validation
* add extra labels
* calculate tick timestamp using the same logic as in scheduler
* implement correct logic of calculating first evaluation timestamp
* add uid, group and folder uid they are needed for jitter strategy

* add JitterOffsetInDuration and JitterStrategy.String()

* add config `backtesting_max_evaluations` to [unified_alerting] (not documented for now)

* remove obsolete tests

* elevate permissions for backtesting endpoint
* move backtesting to separate dir
2025-12-26 16:55:57 -05:00

303 lines
11 KiB
Go

package backtesting
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/url"
"time"
"github.com/benbjohnson/clock"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/apimachinery/identity"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
"github.com/grafana/grafana/pkg/services/ngalert/schedule/ticker"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/util"
)
var (
// ErrInvalidInputData is the sentinel error that Engine.Test wraps when the
// request itself is invalid (for example a nil rule, an empty time range, or
// a failure to build the evaluator). Callers can match it with errors.Is.
ErrInvalidInputData = errors.New("invalid input data")
// logger is the package-level logger; Engine.Test derives a per-run logger
// from it.
logger = log.New("ngalert.backtesting.engine")
// backtestingEvaluatorFactory is the function Engine.Test calls to build the
// evaluator. It defaults to newBacktestingEvaluator.
// NOTE(review): presumably kept as a variable so tests can substitute it — confirm.
backtestingEvaluatorFactory = newBacktestingEvaluator
)
// callbackFunc is the per-evaluation hook handed to a backtestingEvaluator.
// It receives the index of the evaluation, the timestamp the evaluation was
// performed for, and the evaluation results.
// NOTE(review): the boolean result presumably tells the evaluator whether to
// keep going — confirm against the backtestingEvaluator implementations.
type callbackFunc = func(evaluationIndex int, now time.Time, results eval.Results) (bool, error)
// backtestingEvaluator produces evaluation results for a series of points in
// time and reports each of them through a callback.
type backtestingEvaluator interface {
// Eval is called by Engine.Test with the time of the first evaluation, the
// rule interval, the number of evaluations to perform and the callback that
// consumes the results of each evaluation.
Eval(ctx context.Context, from time.Time, interval time.Duration, evaluations int, callback callbackFunc) error
}
// stateManager is the subset of the alerting state manager that backtesting
// relies on: it turns evaluation results into state transitions and also acts
// as a schedule.RuleStateProvider.
type stateManager interface {
// ProcessEvalResults applies the results of one evaluation of the rule at the
// given time and returns the resulting state transitions.
ProcessEvalResults(context.Context, time.Time, *models.AlertRule, eval.Results, data.Labels, state.Sender) state.StateTransitions
schedule.RuleStateProvider
}
// Engine runs alert rule backtests. Use NewEngine to construct it.
type Engine struct {
// evalFactory is passed to the evaluator factory to create rule evaluators.
evalFactory eval.EvaluatorFactory
// createStateManager returns the state manager used for a single Test run.
createStateManager func() stateManager
// disableGrafanaFolder is negated and passed to state.GetRuleExtraLabels.
disableGrafanaFolder bool
// featureToggles is forwarded to state.GetRuleExtraLabels.
featureToggles featuremgmt.FeatureToggles
// minInterval is the smallest rule interval allowed; rules with a shorter
// interval are tested with this interval instead.
minInterval time.Duration
// baseInterval is the tick length used to compute evaluation timestamps
// and the jitter offset.
baseInterval time.Duration
// jitterStrategy is the configured jitter strategy; Test may fall back to
// schedule.JitterNever when the rule lacks the identifiers it needs.
jitterStrategy schedule.JitterStrategy
// maxEvaluations caps the number of evaluations per test when greater than 0.
maxEvaluations int
}
// NewEngine builds a backtesting Engine from the unified alerting settings.
//
// Interval limits, the evaluation cap and the jitter strategy are taken from
// cfg (the strategy also depends on toggles). Every state manager produced by
// the engine is configured with appUrl as the external URL, the given tracer,
// a NoopImageService and a no-op persister.
func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer, cfg setting.UnifiedAlertingSettings, toggles featuremgmt.FeatureToggles) *Engine {
	// A fresh manager is built on every call. Metrics, InstanceStore and
	// Historian are intentionally left at their nil zero values.
	newStateManager := func() stateManager {
		return state.NewManager(state.ManagerCfg{
			ExternalURL: appUrl,
			Images:      &NoopImageService{},
			Clock:       clock.New(),
			Tracer:      tracer,
			Log:         log.New("ngalert.state.manager"),
		}, state.NewNoopPersister())
	}

	// disableGrafanaFolder is left at its zero value (false).
	engine := &Engine{
		evalFactory:        evalFactory,
		createStateManager: newStateManager,
		featureToggles:     toggles,
		minInterval:        cfg.MinInterval,
		baseInterval:       cfg.BaseInterval,
		maxEvaluations:     cfg.BacktestingMaxEvaluations,
		jitterStrategy:     schedule.JitterStrategyFrom(cfg, toggles),
	}
	return engine
}
// Test backtests rule between from and to and returns the recorded state
// transitions as a data frame.
//
// The rule is evaluated repeatedly, starting at the first evaluation time
// computed from from, the engine's base interval and the jitter offset. Each
// set of results is fed to a fresh state manager, and every transition accepted
// by historian.ShouldRecord is added as a row to the result. Adjustments made
// to the request (interval raised to the minimum, jitter ignored, number of
// evaluations capped) are attached to the result as warnings.
//
// Errors:
//   - an error wrapping ErrInvalidInputData when rule is nil, from is not
//     before to, the first evaluation time cannot be computed, or the
//     evaluator cannot be created;
//   - any error returned by the evaluator or while building the result;
//   - "no results were produced" when the callback was never invoked.
//
// folderTitle falls back to "Backtesting" when empty.
func (e *Engine) Test(ctx context.Context, user identity.Requester, rule *models.AlertRule, from, to time.Time, folderTitle string) (res *data.Frame, err error) {
if rule == nil {
return nil, fmt.Errorf("%w: rule is not defined", ErrInvalidInputData)
}
if !from.Before(to) {
return nil, fmt.Errorf("%w: invalid interval [%d,%d]", ErrInvalidInputData, from.Unix(), to.Unix())
}
ruleCtx := models.WithRuleKey(ctx, rule.GetKey())
// Per-run logger (shadows the package-level one) tagged with a generated short UID.
logger := logger.FromContext(ruleCtx).New("backtesting", util.GenerateShortUID())
// warns collects messages that are attached to the result builder on first use.
var warns []string
if rule.GetInterval() < e.minInterval {
logger.Warn("Interval adjusted to minimal interval", "originalInterval", rule.GetInterval(), "adjustedInterval", e.minInterval)
// Work on a copy so that the caller's rule is not modified.
rule = rule.Copy()
rule.IntervalSeconds = int64(e.minInterval.Seconds())
warns = append(warns, fmt.Sprintf("Interval adjusted to minimal interval %ds", rule.IntervalSeconds))
}
// Jitter is ignored when the rule lacks the identifiers the configured strategy checks here.
// NOTE(review): the warning text mentions only group/namespace, although the
// JitterByRule branch triggers on an empty rule UID.
effectiveStrategy := e.jitterStrategy
if e.jitterStrategy == schedule.JitterByGroup && (rule.RuleGroup == "" || rule.NamespaceUID == "") ||
e.jitterStrategy == schedule.JitterByRule && rule.UID == "" {
logger.Warn(fmt.Sprintf("Jitter strategy is set to %s, but rule group or namespace is not set. Ignore jitter", e.jitterStrategy))
warns = append(warns, fmt.Sprintf("Jitter strategy is set to %s, but rule group or namespace is not set. Ignore jitter. The results of testing will be different than real evaluations", e.jitterStrategy))
effectiveStrategy = schedule.JitterNever
}
jitterOffset := schedule.JitterOffsetInDuration(rule, e.baseInterval, effectiveStrategy)
firstEval, err := getFirstEvaluationTime(from, rule, e.baseInterval, jitterOffset)
if err != nil {
return nil, fmt.Errorf("%w: %s", ErrInvalidInputData, err)
}
evaluations := calculateNumberOfEvaluations(firstEval, to, rule.GetInterval())
// A non-positive maxEvaluations disables the cap.
if e.maxEvaluations > 0 && evaluations > e.maxEvaluations {
logger.Warn("Evaluations adjusted to maximal number", "originalEvaluations", evaluations, "adjustedEvaluations", e.maxEvaluations)
warns = append(warns, fmt.Sprintf("Number of evaluations are adjusted to the limit of %d evaluations. Requested: %d", e.maxEvaluations, evaluations))
evaluations = e.maxEvaluations
}
start := time.Now()
// err is the named result, so the deferred logger observes the error of any
// return statement below this point.
defer func() {
if err == nil {
logger.Info("Rule testing finished successfully", "duration", time.Since(start))
} else {
logger.Error("Rule testing finished with error", "duration", time.Since(start), "error", err)
}
}()
stateMgr := e.createStateManager()
// The evaluator reads previous alerting results from the same state manager
// that processes the evaluation results below.
evaluator, err := backtestingEvaluatorFactory(ruleCtx,
e.evalFactory,
user,
rule.GetEvalCondition().WithSource("backtesting"),
&schedule.AlertingResultsFromRuleState{
Manager: stateMgr,
Rule: rule,
},
)
if err != nil {
return nil, errors.Join(ErrInvalidInputData, err)
}
logger.Info("Start testing alert rule", "from", from, "to", to, "interval", rule.GetInterval(), "firstTick", firstEval, "evaluations", evaluations, "jitterOffset", jitterOffset, "jitterStrategy", effectiveStrategy)
// builder is created lazily by the first callback invocation.
var builder *historian.QueryResultBuilder
ruleMeta := history_model.RuleMeta{
ID: rule.ID,
OrgID: rule.OrgID,
UID: rule.UID,
Title: rule.Title,
Group: rule.RuleGroup,
NamespaceUID: rule.NamespaceUID,
// DashboardUID: "",
// PanelID: 0,
Condition: rule.Condition,
}
// Labels attached to every row; they are serialized once and reused.
labels := map[string]string{
historian.OrgIDLabel: fmt.Sprint(ruleMeta.OrgID),
historian.GroupLabel: fmt.Sprint(ruleMeta.Group),
historian.FolderUIDLabel: fmt.Sprint(rule.NamespaceUID),
}
labelsBytes, err := json.Marshal(labels)
if err != nil {
return nil, err
}
// Ensure fallback if empty string is passed
if folderTitle == "" {
folderTitle = "Backtesting"
}
extraLabels := state.GetRuleExtraLabels(logger, rule, folderTitle, !e.disableGrafanaFolder, e.featureToggles)
// processFn handles the results of a single evaluation performed for currentTime.
processFn := func(idx int, currentTime time.Time, results eval.Results) (bool, error) {
// init the builder. Do the best guess for the size of the result
if builder == nil {
builder = historian.NewQueryResultBuilder(evaluations * len(results))
for _, warn := range warns {
builder.AddWarn(warn)
}
}
// nil is passed as the state.Sender.
states := stateMgr.ProcessEvalResults(ruleCtx, currentTime, rule, results, extraLabels, nil)
for _, s := range states {
// Skip transitions that historian.ShouldRecord rejects.
if !historian.ShouldRecord(s) {
continue
}
entry := historian.StateTransitionToLokiEntry(ruleMeta, s)
err := builder.AddRow(currentTime, entry, labelsBytes)
if err != nil {
return false, err
}
}
// NOTE(review): the meaning of the boolean is defined by the evaluator,
// which is not visible here; presumably true means "continue" — confirm.
return idx <= evaluations, nil
}
err = evaluator.Eval(ruleCtx, firstEval, rule.GetInterval(), evaluations, processFn)
if err != nil {
return nil, err
}
// builder stays nil when the evaluator never invoked the callback.
if builder == nil {
return nil, errors.New("no results were produced")
}
return builder.ToFrame(), nil
}
// newBacktestingEvaluator picks the evaluator implementation for condition.
//
// When a query is marked as inline data (its datasource UID or query type is
// "__data__"), it must be the only query and the condition must reference it.
// Its model is then parsed for a data frame under the "data" key and a data
// evaluator is returned. Otherwise a queryEvaluator is created from evalFactory
// with an evaluation context that carries user and reader.
func newBacktestingEvaluator(ctx context.Context, evalFactory eval.EvaluatorFactory, user identity.Requester, condition models.Condition, reader eval.AlertingResultsReader) (backtestingEvaluator, error) {
	const inlineData = "__data__"

	for _, query := range condition.Data {
		if query.DatasourceUID != inlineData && query.QueryType != inlineData {
			continue
		}

		// The checks are ordered: the first failing one determines the error.
		switch {
		case len(condition.Data) != 1:
			return nil, errors.New("data queries are not supported with other expressions or data queries")
		case condition.Condition == "":
			return nil, fmt.Errorf("condition must not be empty and be set to the data query %s", query.RefID)
		case condition.Condition != query.RefID:
			return nil, fmt.Errorf("condition must be set to the data query %s", query.RefID)
		}

		var payload struct {
			DataFrame *data.Frame `json:"data"`
		}
		if err := json.Unmarshal(query.Model, &payload); err != nil {
			return nil, fmt.Errorf("failed to parse data frame: %w", err)
		}
		if payload.DataFrame == nil {
			return nil, errors.New("the data field must not be empty")
		}
		return newDataEvaluator(condition.Condition, payload.DataFrame)
	}

	evalCtx := eval.NewContextWithPreviousResults(ctx, user, reader)
	conditionEvaluator, err := evalFactory.Create(evalCtx, condition)
	if err != nil {
		return nil, err
	}
	return &queryEvaluator{eval: conditionEvaluator}, nil
}
// NoopImageService is an image service stub; NewEngine plugs it into the
// state manager used for backtesting.
type NoopImageService struct{}

// NewImage ignores its arguments and always returns an empty image with a nil
// error.
func (s *NoopImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
	empty := new(models.Image)
	return empty, nil
}
// getNextEvaluationTime returns the time of the first tick, at or after the
// tick that contains currentTime, on which the rule is evaluated.
//
// Time is split into ticks of baseInterval (whole seconds). The rule is
// evaluated every freq = interval/baseInterval ticks, on ticks whose number
// modulo freq matches the jitter offset expressed in ticks.
//
// It returns an error, instead of panicking on an integer division by zero,
// when baseInterval is shorter than one second or the rule interval is not
// positive, and when the rule interval is not a multiple of baseInterval.
func getNextEvaluationTime(currentTime time.Time, rule *models.AlertRule, baseInterval time.Duration, jitterOffset time.Duration) (time.Time, error) {
	baseSeconds := int64(baseInterval.Seconds())
	if baseSeconds <= 0 {
		// Sub-second (or non-positive) base intervals truncate to 0 seconds
		// and would make every division below panic.
		return time.Time{}, fmt.Errorf("base interval %s must be at least one second", baseInterval)
	}
	if rule.IntervalSeconds <= 0 {
		// A zero interval yields freq == 0 and a modulo-by-zero panic below.
		return time.Time{}, fmt.Errorf("interval %ds must be positive", rule.IntervalSeconds)
	}
	if rule.IntervalSeconds%baseSeconds != 0 {
		return time.Time{}, fmt.Errorf("interval %ds is not divisible by base interval %ds", rule.IntervalSeconds, baseSeconds)
	}

	freq := rule.IntervalSeconds / baseSeconds
	firstTickNum := currentTime.Unix() / baseSeconds
	jitterOffsetTicks := int64(jitterOffset / baseInterval)
	// Number of ticks to advance from firstTickNum to reach the next tick
	// that matches the jitter offset; adding freq keeps the operand of the
	// modulo non-negative.
	firstEvalTickNum := firstTickNum + (jitterOffsetTicks-(firstTickNum%freq)+freq)%freq
	return time.Unix(firstEvalTickNum*baseSeconds, 0), nil
}
// getFirstEvaluationTime returns the time of the first evaluation of rule
// that is not before from.
//
// The start tick for from is obtained from ticker.GetStartTick and the first
// evaluation at or after that tick from getNextEvaluationTime, whose error is
// returned unchanged. If that evaluation is still before from, it is moved
// forward by the smallest whole number of rule intervals that covers the gap.
func getFirstEvaluationTime(from time.Time, rule *models.AlertRule, baseInterval time.Duration, jitterOffset time.Duration) (time.Time, error) {
	startTick := ticker.GetStartTick(from, baseInterval)

	evalTime, err := getNextEvaluationTime(startTick, rule, baseInterval, jitterOffset)
	if err != nil {
		return time.Time{}, err
	}
	if !evalTime.Before(from) {
		return evalTime, nil
	}

	// Ceiling division of the gap by the rule interval.
	gap := from.Sub(evalTime)
	step := rule.GetInterval()
	steps := (gap + step - 1) / step
	return evalTime.Add(step * steps), nil
}
// calculateNumberOfEvaluations returns how many evaluations fit between
// firstEval and to when they are interval apart: the whole seconds between
// the two instants divided by the whole seconds of interval, rounded down.
//
// The result is never less than 1. In particular 1 is returned when to is not
// after firstEval and when interval is shorter than one second (including
// zero or negative), which would otherwise cause an integer division by zero
// or a negative count.
func calculateNumberOfEvaluations(firstEval, to time.Time, interval time.Duration) int {
	intervalSeconds := int(interval.Seconds())
	if intervalSeconds <= 0 || !to.After(firstEval) {
		return 1
	}
	evaluations := int(to.Sub(firstEval).Seconds()) / intervalSeconds
	if evaluations < 1 {
		return 1
	}
	return evaluations
}