What is this feature? This PR implements a jitter mechanism for periodic alert state storage to distribute database load over time instead of processing all alert instances simultaneously. When enabled via the state_periodic_save_jitter_enabled configuration option, the system spreads batch write operations across 85% of the save interval window, preventing database load spikes in high-cardinality alerting environments. Why do we need this feature? In production environments with high alert cardinality, the current periodic batch storage can cause database performance issues by processing all alert instances simultaneously at fixed intervals. Even when using periodic batch storage to improve performance, concentrating all database operations at a single point in time can overwhelm database resources, especially in resource-constrained environments. Rather than performing all INSERT operations at once during the periodic save, distributing these operations across the time window until the next save cycle can maintain more stable service operation within limited database resources. This approach prevents resource saturation by spreading the database load over the available time interval, allowing the system to operate more gracefully within existing resource constraints. For example, with 200,000 alert instances using a 5-minute interval and 4,000 batch size, instead of executing 50 batch operations simultaneously, the jitter mechanism distributes these operations across approximately 4.25 minutes (85% of 5 minutes), with each batch executed roughly every 5.2 seconds. This PR provides system-level protection against such load spikes by distributing operations across time, reducing peak resource usage while maintaining the benefits of periodic batch storage. The jitter mechanism is particularly valuable in resource-constrained environments where maintaining consistent database performance is more critical than precise timing of state updates.
152 lines
3.8 KiB
Go
152 lines
3.8 KiB
Go
package state
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math/rand"
|
|
"slices"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
|
|
"github.com/grafana/grafana/pkg/services/screenshot"
|
|
)
|
|
|
|
// Compile-time check that *FakeInstanceStore can be used as an InstanceStore.
var _ InstanceStore = &FakeInstanceStore{}
|
|
|
|
// FakeInstanceStore is a test double that keeps an in-memory log of the
// operations invoked on it.
type FakeInstanceStore struct {
	// mtx is held by the store's methods while they touch recordedOps.
	mtx sync.Mutex
	// recordedOps is the log of recorded calls; entries are either the raw
	// argument of the call or a FakeInstanceStoreOp descriptor.
	recordedOps []any
}
|
|
|
|
// FakeInstanceStoreOp describes a single recorded call: the name of the
// invoked method together with the arguments it received.
type FakeInstanceStoreOp struct {
	// Name is the name of the invoked method.
	Name string
	// Args holds the arguments passed to the method.
	Args []any
}
|
|
|
|
func (f *FakeInstanceStore) RecordedOps() []any {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
return slices.Clone(f.recordedOps)
|
|
}
|
|
|
|
func (f *FakeInstanceStore) ListAlertInstances(_ context.Context, q *models.ListAlertInstancesQuery) ([]*models.AlertInstance, error) {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
f.recordedOps = append(f.recordedOps, *q)
|
|
return nil, nil
|
|
}
|
|
|
|
func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, q models.AlertInstance) error {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
f.recordedOps = append(f.recordedOps, q)
|
|
return nil
|
|
}
|
|
|
|
func (f *FakeInstanceStore) DeleteAlertInstances(ctx context.Context, q ...models.AlertInstanceKey) error {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
f.recordedOps = append(f.recordedOps, FakeInstanceStoreOp{
|
|
Name: "DeleteAlertInstances", Args: []any{
|
|
ctx,
|
|
q,
|
|
},
|
|
})
|
|
return nil
|
|
}
|
|
|
|
func (f *FakeInstanceStore) SaveAlertInstancesForRule(ctx context.Context, key models.AlertRuleKeyWithGroup, instances []models.AlertInstance) error {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
|
|
f.recordedOps = append(f.recordedOps, FakeInstanceStoreOp{
|
|
Name: "SaveAlertInstancesForRule", Args: []any{
|
|
ctx,
|
|
key,
|
|
instances,
|
|
},
|
|
})
|
|
|
|
return nil
|
|
}
|
|
|
|
func (f *FakeInstanceStore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKeyWithGroup) error {
|
|
return nil
|
|
}
|
|
|
|
func (f *FakeInstanceStore) FullSync(ctx context.Context, instances []models.AlertInstance, batchSize int, jitterFunc func(int) time.Duration) error {
|
|
f.mtx.Lock()
|
|
defer f.mtx.Unlock()
|
|
f.recordedOps = []any{}
|
|
for _, instance := range instances {
|
|
f.recordedOps = append(f.recordedOps, instance)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type FakeRuleReader struct{}
|
|
|
|
func (f *FakeRuleReader) ListAlertRules(_ context.Context, q *models.ListAlertRulesQuery) (models.RulesGroup, error) {
|
|
return nil, nil
|
|
}
|
|
|
|
type FakeHistorian struct {
|
|
StateTransitions []StateTransition
|
|
}
|
|
|
|
func (f *FakeHistorian) Record(ctx context.Context, rule history_model.RuleMeta, states []StateTransition) <-chan error {
|
|
f.StateTransitions = append(f.StateTransitions, states...)
|
|
errCh := make(chan error)
|
|
close(errCh)
|
|
return errCh
|
|
}
|
|
|
|
// NotAvailableImageService is a service that returns ErrScreenshotsUnavailable.
|
|
type NotAvailableImageService struct{}
|
|
|
|
func (s *NotAvailableImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
|
|
return nil, screenshot.ErrScreenshotsUnavailable
|
|
}
|
|
|
|
// NoopImageService is a no-op image service.
|
|
type NoopImageService struct{}
|
|
|
|
func (s *NoopImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
|
|
return &models.Image{}, nil
|
|
}
|
|
|
|
// NoopSender is a no-op sender. Used when you want state manager to update LastSentAt without sending any alerts.
|
|
var NoopSender = func(_ context.Context, _ StateTransitions) {}
|
|
|
|
type CountingImageService struct {
|
|
mtx sync.Mutex
|
|
Called int
|
|
Image *models.Image
|
|
Err error
|
|
}
|
|
|
|
func (c *CountingImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
|
|
c.mtx.Lock()
|
|
defer c.mtx.Unlock()
|
|
c.Called += 1
|
|
return c.Image, c.Err
|
|
}
|
|
|
|
func newSuccessfulCountingImageService() *CountingImageService {
|
|
return &CountingImageService{
|
|
Called: 0,
|
|
Image: &models.Image{
|
|
Token: fmt.Sprint(rand.Int()),
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewFailingCountingImageService(err error) *CountingImageService {
|
|
return &CountingImageService{
|
|
Called: 0,
|
|
Err: err,
|
|
}
|
|
}
|