* add function to convert StateTransition to LokiEntry * add QueryResultBuilder * update backtesting to produce result similar to historian * make shouldRecord public * filter out noop transitions * add experimental front-end * add new fields * move conversion of api model to AlertRule to validation * add extra labels * calculate tick timestamp using the same logic as in scheduler * implement correct logic of calculating first evaluation timestamp * add uid, group and folder uid they are needed for jitter strategy * add JitterOffsetInDuration and JitterStrategy.String() * add config `backtesting_max_evaluations` to [unified_alerting] (not documented for now) * remove obsolete tests * elevate permisisons for backtesting endpoint * move backtesting to separate dir
303 lines
11 KiB
Go
303 lines
11 KiB
Go
package backtesting
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/url"
|
|
"time"
|
|
|
|
"github.com/benbjohnson/clock"
|
|
|
|
"github.com/grafana/grafana-plugin-sdk-go/data"
|
|
|
|
"github.com/grafana/grafana/pkg/apimachinery/identity"
|
|
"github.com/grafana/grafana/pkg/infra/log"
|
|
"github.com/grafana/grafana/pkg/infra/tracing"
|
|
"github.com/grafana/grafana/pkg/services/featuremgmt"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/schedule/ticker"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
|
|
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
|
|
"github.com/grafana/grafana/pkg/setting"
|
|
"github.com/grafana/grafana/pkg/util"
|
|
)
|
|
|
|
var (
|
|
ErrInvalidInputData = errors.New("invalid input data")
|
|
|
|
logger = log.New("ngalert.backtesting.engine")
|
|
backtestingEvaluatorFactory = newBacktestingEvaluator
|
|
)
|
|
|
|
type callbackFunc = func(evaluationIndex int, now time.Time, results eval.Results) (bool, error)
|
|
|
|
type backtestingEvaluator interface {
|
|
Eval(ctx context.Context, from time.Time, interval time.Duration, evaluations int, callback callbackFunc) error
|
|
}
|
|
|
|
type stateManager interface {
|
|
ProcessEvalResults(context.Context, time.Time, *models.AlertRule, eval.Results, data.Labels, state.Sender) state.StateTransitions
|
|
schedule.RuleStateProvider
|
|
}
|
|
|
|
type Engine struct {
|
|
evalFactory eval.EvaluatorFactory
|
|
createStateManager func() stateManager
|
|
disableGrafanaFolder bool
|
|
featureToggles featuremgmt.FeatureToggles
|
|
minInterval time.Duration
|
|
baseInterval time.Duration
|
|
jitterStrategy schedule.JitterStrategy
|
|
maxEvaluations int
|
|
}
|
|
|
|
func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer, cfg setting.UnifiedAlertingSettings, toggles featuremgmt.FeatureToggles) *Engine {
|
|
return &Engine{
|
|
evalFactory: evalFactory,
|
|
createStateManager: func() stateManager {
|
|
cfg := state.ManagerCfg{
|
|
Metrics: nil,
|
|
ExternalURL: appUrl,
|
|
InstanceStore: nil,
|
|
Images: &NoopImageService{},
|
|
Clock: clock.New(),
|
|
Historian: nil,
|
|
Tracer: tracer,
|
|
Log: log.New("ngalert.state.manager"),
|
|
}
|
|
return state.NewManager(cfg, state.NewNoopPersister())
|
|
},
|
|
disableGrafanaFolder: false,
|
|
featureToggles: toggles,
|
|
minInterval: cfg.MinInterval,
|
|
baseInterval: cfg.BaseInterval,
|
|
maxEvaluations: cfg.BacktestingMaxEvaluations,
|
|
jitterStrategy: schedule.JitterStrategyFrom(cfg, toggles),
|
|
}
|
|
}
|
|
|
|
func (e *Engine) Test(ctx context.Context, user identity.Requester, rule *models.AlertRule, from, to time.Time, folderTitle string) (res *data.Frame, err error) {
|
|
if rule == nil {
|
|
return nil, fmt.Errorf("%w: rule is not defined", ErrInvalidInputData)
|
|
}
|
|
if !from.Before(to) {
|
|
return nil, fmt.Errorf("%w: invalid interval [%d,%d]", ErrInvalidInputData, from.Unix(), to.Unix())
|
|
}
|
|
|
|
ruleCtx := models.WithRuleKey(ctx, rule.GetKey())
|
|
logger := logger.FromContext(ruleCtx).New("backtesting", util.GenerateShortUID())
|
|
|
|
var warns []string
|
|
if rule.GetInterval() < e.minInterval {
|
|
logger.Warn("Interval adjusted to minimal interval", "originalInterval", rule.GetInterval(), "adjustedInterval", e.minInterval)
|
|
rule = rule.Copy()
|
|
rule.IntervalSeconds = int64(e.minInterval.Seconds())
|
|
warns = append(warns, fmt.Sprintf("Interval adjusted to minimal interval %ds", rule.IntervalSeconds))
|
|
}
|
|
|
|
effectiveStrategy := e.jitterStrategy
|
|
if e.jitterStrategy == schedule.JitterByGroup && (rule.RuleGroup == "" || rule.NamespaceUID == "") ||
|
|
e.jitterStrategy == schedule.JitterByRule && rule.UID == "" {
|
|
logger.Warn(fmt.Sprintf("Jitter strategy is set to %s, but rule group or namespace is not set. Ignore jitter", e.jitterStrategy))
|
|
warns = append(warns, fmt.Sprintf("Jitter strategy is set to %s, but rule group or namespace is not set. Ignore jitter. The results of testing will be different than real evaluations", e.jitterStrategy))
|
|
effectiveStrategy = schedule.JitterNever
|
|
}
|
|
jitterOffset := schedule.JitterOffsetInDuration(rule, e.baseInterval, effectiveStrategy)
|
|
firstEval, err := getFirstEvaluationTime(from, rule, e.baseInterval, jitterOffset)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("%w: %s", ErrInvalidInputData, err)
|
|
}
|
|
|
|
evaluations := calculateNumberOfEvaluations(firstEval, to, rule.GetInterval())
|
|
if e.maxEvaluations > 0 && evaluations > e.maxEvaluations {
|
|
logger.Warn("Evaluations adjusted to maximal number", "originalEvaluations", evaluations, "adjustedEvaluations", e.maxEvaluations)
|
|
warns = append(warns, fmt.Sprintf("Number of evaluations are adjusted to the limit of %d evaluations. Requested: %d", e.maxEvaluations, evaluations))
|
|
evaluations = e.maxEvaluations
|
|
}
|
|
|
|
start := time.Now()
|
|
defer func() {
|
|
if err == nil {
|
|
logger.Info("Rule testing finished successfully", "duration", time.Since(start))
|
|
} else {
|
|
logger.Error("Rule testing finished with error", "duration", time.Since(start), "error", err)
|
|
}
|
|
}()
|
|
|
|
stateMgr := e.createStateManager()
|
|
|
|
evaluator, err := backtestingEvaluatorFactory(ruleCtx,
|
|
e.evalFactory,
|
|
user,
|
|
rule.GetEvalCondition().WithSource("backtesting"),
|
|
&schedule.AlertingResultsFromRuleState{
|
|
Manager: stateMgr,
|
|
Rule: rule,
|
|
},
|
|
)
|
|
if err != nil {
|
|
return nil, errors.Join(ErrInvalidInputData, err)
|
|
}
|
|
|
|
logger.Info("Start testing alert rule", "from", from, "to", to, "interval", rule.GetInterval(), "firstTick", firstEval, "evaluations", evaluations, "jitterOffset", jitterOffset, "jitterStrategy", effectiveStrategy)
|
|
|
|
var builder *historian.QueryResultBuilder
|
|
|
|
ruleMeta := history_model.RuleMeta{
|
|
ID: rule.ID,
|
|
OrgID: rule.OrgID,
|
|
UID: rule.UID,
|
|
Title: rule.Title,
|
|
Group: rule.RuleGroup,
|
|
NamespaceUID: rule.NamespaceUID,
|
|
// DashboardUID: "",
|
|
// PanelID: 0,
|
|
Condition: rule.Condition,
|
|
}
|
|
labels := map[string]string{
|
|
historian.OrgIDLabel: fmt.Sprint(ruleMeta.OrgID),
|
|
historian.GroupLabel: fmt.Sprint(ruleMeta.Group),
|
|
historian.FolderUIDLabel: fmt.Sprint(rule.NamespaceUID),
|
|
}
|
|
labelsBytes, err := json.Marshal(labels)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Ensure fallback if empty string is passed
|
|
if folderTitle == "" {
|
|
folderTitle = "Backtesting"
|
|
}
|
|
extraLabels := state.GetRuleExtraLabels(logger, rule, folderTitle, !e.disableGrafanaFolder, e.featureToggles)
|
|
|
|
processFn := func(idx int, currentTime time.Time, results eval.Results) (bool, error) {
|
|
// init the builder. Do the best guess for the size of the result
|
|
if builder == nil {
|
|
builder = historian.NewQueryResultBuilder(evaluations * len(results))
|
|
for _, warn := range warns {
|
|
builder.AddWarn(warn)
|
|
}
|
|
}
|
|
states := stateMgr.ProcessEvalResults(ruleCtx, currentTime, rule, results, extraLabels, nil)
|
|
for _, s := range states {
|
|
if !historian.ShouldRecord(s) {
|
|
continue
|
|
}
|
|
entry := historian.StateTransitionToLokiEntry(ruleMeta, s)
|
|
err := builder.AddRow(currentTime, entry, labelsBytes)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
}
|
|
return idx <= evaluations, nil
|
|
}
|
|
|
|
err = evaluator.Eval(ruleCtx, firstEval, rule.GetInterval(), evaluations, processFn)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if builder == nil {
|
|
return nil, errors.New("no results were produced")
|
|
}
|
|
return builder.ToFrame(), nil
|
|
}
|
|
|
|
func newBacktestingEvaluator(ctx context.Context, evalFactory eval.EvaluatorFactory, user identity.Requester, condition models.Condition, reader eval.AlertingResultsReader) (backtestingEvaluator, error) {
|
|
for _, q := range condition.Data {
|
|
if q.DatasourceUID == "__data__" || q.QueryType == "__data__" {
|
|
if len(condition.Data) != 1 {
|
|
return nil, errors.New("data queries are not supported with other expressions or data queries")
|
|
}
|
|
if condition.Condition == "" {
|
|
return nil, fmt.Errorf("condition must not be empty and be set to the data query %s", q.RefID)
|
|
}
|
|
if condition.Condition != q.RefID {
|
|
return nil, fmt.Errorf("condition must be set to the data query %s", q.RefID)
|
|
}
|
|
model := struct {
|
|
DataFrame *data.Frame `json:"data"`
|
|
}{}
|
|
err := json.Unmarshal(q.Model, &model)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse data frame: %w", err)
|
|
}
|
|
if model.DataFrame == nil {
|
|
return nil, errors.New("the data field must not be empty")
|
|
}
|
|
return newDataEvaluator(condition.Condition, model.DataFrame)
|
|
}
|
|
}
|
|
|
|
evaluator, err := evalFactory.Create(eval.NewContextWithPreviousResults(ctx, user, reader), condition)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &queryEvaluator{
|
|
eval: evaluator,
|
|
}, nil
|
|
}
|
|
|
|
// NoopImageService is a no-op image service.
|
|
type NoopImageService struct{}
|
|
|
|
func (s *NoopImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
|
|
return &models.Image{}, nil
|
|
}
|
|
|
|
func getNextEvaluationTime(currentTime time.Time, rule *models.AlertRule, baseInterval time.Duration, jitterOffset time.Duration) (time.Time, error) {
|
|
if rule.IntervalSeconds%int64(baseInterval.Seconds()) != 0 {
|
|
return time.Time{}, fmt.Errorf("interval %ds is not divisible by base interval %ds", rule.IntervalSeconds, int64(baseInterval.Seconds()))
|
|
}
|
|
|
|
freq := rule.IntervalSeconds / int64(baseInterval.Seconds())
|
|
|
|
firstTickNum := currentTime.Unix() / int64(baseInterval.Seconds())
|
|
|
|
jitterOffsetTicks := int64(jitterOffset / baseInterval)
|
|
|
|
firstEvalTickNum := firstTickNum + (jitterOffsetTicks-(firstTickNum%freq)+freq)%freq
|
|
|
|
return time.Unix(firstEvalTickNum*int64(baseInterval.Seconds()), 0), nil
|
|
}
|
|
|
|
func getFirstEvaluationTime(from time.Time, rule *models.AlertRule, baseInterval time.Duration, jitterOffset time.Duration) (time.Time, error) {
|
|
// Now calculate the time of the tick the same way as in the scheduler
|
|
firstTick := ticker.GetStartTick(from, baseInterval)
|
|
|
|
// calculate time of the first evaluation that is at or after the first tick
|
|
firstEval, err := getNextEvaluationTime(firstTick, rule, baseInterval, jitterOffset)
|
|
if err != nil {
|
|
return time.Time{}, err
|
|
}
|
|
|
|
// Ensure firstEval is at or after from
|
|
// Calculate how many intervals to skip to get past 'from'
|
|
if firstEval.Before(from) {
|
|
diff := from.Sub(firstEval)
|
|
interval := rule.GetInterval()
|
|
// Ceiling division: how many intervals needed to cover the difference
|
|
intervalsToAdd := (diff + interval - 1) / interval
|
|
firstEval = firstEval.Add(interval * intervalsToAdd)
|
|
}
|
|
|
|
return firstEval, nil
|
|
}
|
|
|
|
func calculateNumberOfEvaluations(firstEval, to time.Time, interval time.Duration) int {
|
|
var evaluations int
|
|
if to.After(firstEval) {
|
|
evaluations = int(to.Sub(firstEval).Seconds()) / int(interval.Seconds())
|
|
}
|
|
if evaluations == 0 {
|
|
evaluations = 1
|
|
}
|
|
return evaluations
|
|
}
|