Files
grafana/pkg/services/ngalert/migration/service.go
T
Matthew Jacobson c2efcdde09 Alerting: Fix flaky SQLITE_BUSY when migrating with provisioned dashboards (#76658)
* Alerting: Move migration from background service run to ngalert init

sqlite database write contention between the migration's single transaction and
dashboard provisioning's frequent commits was causing the migration to
 fail with SQLITE_BUSY/SQLITE_BUSY_SNAPSHOT on all retries.

 This is not a new issue for sqlite+grafana, but the discrepancy between the
 length of the transactions was causing it to be very consistent. In addition,
 since a failed migration has implications on the assumed correctness of the
 alertmanager and alert rule definition state, we cause a server shutdown on
 error. This can make e2e tests as well as some high-load provisioned
 sqlite installations flaky on startup.

 The correct fix for this is better transaction management across various
 services and is out of scope for this change as we're primarily interested in
 mitigating the current bout of server failures in e2e tests when using sqlite.
2023-10-19 10:03:00 -04:00

137 lines
4.1 KiB
Go

package migration
import (
"context"
"fmt"
"time"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/serverlock"
migrationStore "github.com/grafana/grafana/pkg/services/ngalert/migration/store"
"github.com/grafana/grafana/pkg/services/secrets"
"github.com/grafana/grafana/pkg/setting"
)
// actionName is the unique row-level lock name for serverlock.ServerLockService.
// Run passes it to LockExecuteAndRelease to identify the migration's lock.
const actionName = "alerting migration"
// ForceMigrationError is returned by Run when the stored migration status says
// Grafana has already been migrated, the current configuration calls for a
// revert, and cfg.ForceMigration is not set. It acts as a safeguard against
// deleting Unified Alerting data by accident.
//
//nolint:stylecheck
var ForceMigrationError = fmt.Errorf("Grafana has already been migrated to Unified Alerting. Any alert rules created while using Unified Alerting will be deleted by rolling back. Set force_migration=true in your grafana.ini and restart Grafana to roll back and delete Unified Alerting configuration data.")
// UpgradeService is the interface returned by ProvideService. It exposes the
// single entry point used to run the alerting migration.
type UpgradeService interface {
// Run performs the migration (or its revert) if one is needed and returns a
// non-nil error when it fails.
Run(ctx context.Context) error
}
// migrationService is the UpgradeService implementation constructed by
// ProvideService.
type migrationService struct {
// lock is used by Run to execute the migration under the actionName server lock.
lock *serverlock.ServerLockService
// cfg supplies UnifiedAlerting.IsEnabled() and ForceMigration, which decide
// whether Run migrates, reverts, or does nothing.
cfg *setting.Cfg
// log is the logger for migration progress messages.
log log.Logger
// store provides the database transaction that wraps the whole migration.
store db.DB
// migrationStore reads/writes the migration status and performs the
// org-level migration and revert operations.
migrationStore migrationStore.Store
// encryptionService is not referenced in the code visible here.
// NOTE(review): presumably used by the per-org migration — confirm.
encryptionService secrets.Service
}
// ProvideService wires the given dependencies into a migrationService and
// returns it as an UpgradeService. The logger is created here under the name
// "ngalert.migration". The returned error is always nil.
func ProvideService(
	lock *serverlock.ServerLockService,
	cfg *setting.Cfg,
	store db.DB,
	migrationStore migrationStore.Store,
	encryptionService secrets.Service,
) (UpgradeService, error) {
	svc := new(migrationService)

	svc.lock = lock
	svc.log = log.New("ngalert.migration")
	svc.cfg = cfg
	svc.store = store
	svc.migrationStore = migrationStore
	svc.encryptionService = encryptionService

	return svc, nil
}
// Run starts the migration. This will either migrate from legacy alerting to unified alerting or revert the migration.
// If the migration status in the kvstore is not set and unified alerting is enabled, the migration will be executed.
// If the migration status in the kvstore is set and both unified alerting is disabled and ForceMigration is set to true, the migration will be reverted.
//
// The work runs under the actionName server lock and inside a single database
// transaction. Any error from the migration itself is returned wrapped with
// "migration failed". An error from the lock service is only logged and Run
// returns nil, so a lock problem never fails the caller.
func (ms *migrationService) Run(ctx context.Context) error {
	// errMigration carries the transaction result out of the lock callback,
	// which has no return value of its own.
	var errMigration error

	// NOTE(review): the 10 minute argument is presumably the maximum time the
	// lock may be held — confirm against serverlock.
	errLock := ms.lock.LockExecuteAndRelease(ctx, actionName, time.Minute*10, func(ctx context.Context) {
		ms.log.Info("Starting")
		errMigration = ms.store.InTransaction(ctx, func(ctx context.Context) error {
			migrated, err := ms.migrationStore.IsMigrated(ctx)
			if err != nil {
				return fmt.Errorf("getting migration status: %w", err)
			}

			// The stored status already matches the configured alerting mode.
			if migrated == ms.cfg.UnifiedAlerting.IsEnabled() {
				ms.log.Info("No migrations to run")
				return nil
			}

			if migrated {
				// Unified alerting is disabled here. If legacy alerting is
				// explicitly disabled as well, there is nothing to revert to.
				if setting.AlertingEnabled != nil && !*setting.AlertingEnabled {
					return nil
				}

				// Safeguard to prevent data loss when reverting from UA to LA.
				if !ms.cfg.ForceMigration {
					return ForceMigrationError
				}

				ms.log.Info("Reverting legacy migration")
				// Reuse the outer err instead of shadowing it.
				err = ms.migrationStore.RevertAllOrgs(ctx)
				if err != nil {
					return fmt.Errorf("reverting migration: %w", err)
				}
				ms.log.Info("Legacy migration reverted")
				return nil
			}

			ms.log.Info("Starting legacy migration")
			err = ms.migrateAllOrgs(ctx)
			if err != nil {
				return fmt.Errorf("executing migration: %w", err)
			}

			// Record the status only after every org migrated successfully.
			err = ms.migrationStore.SetMigrated(ctx, true)
			if err != nil {
				return fmt.Errorf("setting migration status: %w", err)
			}
			ms.log.Info("Completed legacy migration")
			return nil
		})
	})

	if errLock != nil {
		// The lock error is deliberately not returned. It is attached to the
		// log entry so that a failure other than an already-held lock is not
		// hidden behind this message.
		ms.log.Warn("Server lock for alerting migration already exists", "error", errLock)
		return nil
	}
	if errMigration != nil {
		return fmt.Errorf("migration failed: %w", errMigration)
	}
	return nil
}
// migrateAllOrgs runs the migration for every org returned by the migration
// store, one after another. After an org is migrated its resulting state is
// persisted via SetOrgMigrationState. The first error aborts the loop and is
// returned wrapped; later orgs are not processed.
func (ms *migrationService) migrateAllOrgs(ctx context.Context) error {
	allOrgs, listErr := ms.migrationStore.GetAllOrgs(ctx)
	if listErr != nil {
		return fmt.Errorf("get orgs: %w", listErr)
	}

	for _, current := range allOrgs {
		orgMig := ms.newOrgMigration(current.ID)

		if migErr := orgMig.migrateOrg(ctx); migErr != nil {
			return fmt.Errorf("migrate org %d: %w", current.ID, migErr)
		}

		// Persist the state produced by this org's migration before moving on.
		if saveErr := orgMig.migrationStore.SetOrgMigrationState(ctx, current.ID, orgMig.state); saveErr != nil {
			return fmt.Errorf("set org migration state: %w", saveErr)
		}
	}

	return nil
}