c2efcdde09
* Alerting: Move migration from background service run to ngalert init. SQLite database write contention between the migration's single transaction and dashboard provisioning's frequent commits was causing the migration to fail with SQLITE_BUSY/SQLITE_BUSY_SNAPSHOT on all retries. This is not a new issue for sqlite+grafana, but the discrepancy between the length of the transactions was causing it to be very consistent. In addition, since a failed migration has implications on the assumed correctness of the alertmanager and alert rule definition state, we cause a server shutdown on error. This can make e2e tests as well as some high-load provisioned sqlite installations flaky on startup. The correct fix for this is better transaction management across various services and is out of scope for this change as we're primarily interested in mitigating the current bout of server failures in e2e tests when using sqlite.
137 lines
4.1 KiB
Go
137 lines
4.1 KiB
Go
package migration
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/grafana/grafana/pkg/infra/db"
|
|
"github.com/grafana/grafana/pkg/infra/log"
|
|
"github.com/grafana/grafana/pkg/infra/serverlock"
|
|
migrationStore "github.com/grafana/grafana/pkg/services/ngalert/migration/store"
|
|
"github.com/grafana/grafana/pkg/services/secrets"
|
|
"github.com/grafana/grafana/pkg/setting"
|
|
)
|
|
|
|
// actionName is the unique row-level lock name handed to
// serverlock.ServerLockService when the alerting migration runs.
const actionName = "alerting migration"
|
|
|
|
// ForceMigrationError is returned by the migration when Grafana has already
// been migrated to Unified Alerting, a rollback is being requested, and
// force_migration has not been set. It guards against deleting Unified
// Alerting data by accident. The message is user-facing, hence the
// capitalisation and trailing period that stylecheck would otherwise flag.
//
//nolint:stylecheck
var ForceMigrationError = fmt.Errorf(
	"Grafana has already been migrated to Unified Alerting. Any alert rules created while using Unified Alerting will be deleted by rolling back. Set force_migration=true in your grafana.ini and restart Grafana to roll back and delete Unified Alerting configuration data.",
)
|
|
|
|
// UpgradeService is the entry point for the alerting migration. It is the
// type returned by ProvideService.
type UpgradeService interface {
	// Run performs the migration (or its revert) and reports any failure.
	Run(context.Context) error
}
|
|
|
|
type migrationService struct {
|
|
lock *serverlock.ServerLockService
|
|
cfg *setting.Cfg
|
|
log log.Logger
|
|
store db.DB
|
|
migrationStore migrationStore.Store
|
|
|
|
encryptionService secrets.Service
|
|
}
|
|
|
|
func ProvideService(
|
|
lock *serverlock.ServerLockService,
|
|
cfg *setting.Cfg,
|
|
store db.DB,
|
|
migrationStore migrationStore.Store,
|
|
encryptionService secrets.Service,
|
|
) (UpgradeService, error) {
|
|
return &migrationService{
|
|
lock: lock,
|
|
log: log.New("ngalert.migration"),
|
|
cfg: cfg,
|
|
store: store,
|
|
migrationStore: migrationStore,
|
|
encryptionService: encryptionService,
|
|
}, nil
|
|
}
|
|
|
|
// Run starts the migration. This will either migrate from legacy alerting to unified alerting or revert the migration.
|
|
// If the migration status in the kvstore is not set and unified alerting is enabled, the migration will be executed.
|
|
// If the migration status in the kvstore is set and both unified alerting is disabled and ForceMigration is set to true, the migration will be reverted.
|
|
func (ms *migrationService) Run(ctx context.Context) error {
|
|
var errMigration error
|
|
errLock := ms.lock.LockExecuteAndRelease(ctx, actionName, time.Minute*10, func(ctx context.Context) {
|
|
ms.log.Info("Starting")
|
|
errMigration = ms.store.InTransaction(ctx, func(ctx context.Context) error {
|
|
migrated, err := ms.migrationStore.IsMigrated(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("getting migration status: %w", err)
|
|
}
|
|
if migrated == ms.cfg.UnifiedAlerting.IsEnabled() {
|
|
// Nothing to do.
|
|
ms.log.Info("No migrations to run")
|
|
return nil
|
|
}
|
|
|
|
if migrated {
|
|
// If legacy alerting is also disabled, there is nothing to do
|
|
if setting.AlertingEnabled != nil && !*setting.AlertingEnabled {
|
|
return nil
|
|
}
|
|
|
|
// Safeguard to prevent data loss when reverting from UA to LA.
|
|
if !ms.cfg.ForceMigration {
|
|
return ForceMigrationError
|
|
}
|
|
|
|
// Revert migration
|
|
ms.log.Info("Reverting legacy migration")
|
|
err := ms.migrationStore.RevertAllOrgs(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("reverting migration: %w", err)
|
|
}
|
|
ms.log.Info("Legacy migration reverted")
|
|
return nil
|
|
}
|
|
|
|
ms.log.Info("Starting legacy migration")
|
|
err = ms.migrateAllOrgs(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("executing migration: %w", err)
|
|
}
|
|
|
|
err = ms.migrationStore.SetMigrated(ctx, true)
|
|
if err != nil {
|
|
return fmt.Errorf("setting migration status: %w", err)
|
|
}
|
|
|
|
ms.log.Info("Completed legacy migration")
|
|
return nil
|
|
})
|
|
})
|
|
if errLock != nil {
|
|
ms.log.Warn("Server lock for alerting migration already exists")
|
|
return nil
|
|
}
|
|
if errMigration != nil {
|
|
return fmt.Errorf("migration failed: %w", errMigration)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// migrateAllOrgs executes the migration for all orgs.
|
|
func (ms *migrationService) migrateAllOrgs(ctx context.Context) error {
|
|
orgs, err := ms.migrationStore.GetAllOrgs(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("get orgs: %w", err)
|
|
}
|
|
|
|
for _, o := range orgs {
|
|
om := ms.newOrgMigration(o.ID)
|
|
if err := om.migrateOrg(ctx); err != nil {
|
|
return fmt.Errorf("migrate org %d: %w", o.ID, err)
|
|
}
|
|
|
|
err = om.migrationStore.SetOrgMigrationState(ctx, o.ID, om.state)
|
|
if err != nil {
|
|
return fmt.Errorf("set org migration state: %w", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|