Files
grafana/pkg/services/cleanup/cleanup.go

421 lines
15 KiB
Go

package cleanup
import (
"context"
"errors"
"fmt"
"io/fs"
"os"
"path"
"strconv"
"time"
"go.opentelemetry.io/otel/attribute"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
"github.com/grafana/grafana/apps/shorturl/pkg/apis/shorturl/v1alpha1"
"github.com/grafana/grafana/pkg/apimachinery/identity"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/serverlock"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/annotations"
grafanaapiserver "github.com/grafana/grafana/pkg/services/apiserver"
"github.com/grafana/grafana/pkg/services/apiserver/endpoints/request"
"github.com/grafana/grafana/pkg/services/dashboardsnapshots"
dashver "github.com/grafana/grafana/pkg/services/dashboardversion"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/ngalert/image"
"github.com/grafana/grafana/pkg/services/org"
"github.com/grafana/grafana/pkg/services/queryhistory"
"github.com/grafana/grafana/pkg/services/shorturls"
tempuser "github.com/grafana/grafana/pkg/services/temp_user"
"github.com/grafana/grafana/pkg/setting"
)
// AlertRuleService is the alert-rule dependency of the cleanup service. Its
// single method is invoked by the "cleanup trash alert rules" job.
type AlertRuleService interface {
// CleanUpDeletedAlertRules performs the alert-rule cleanup and returns a
// count (logged by the caller as "rows affected") or an error.
// NOTE(review): presumably purges rules that were deleted longer ago than
// the configured retention; confirm against the implementation.
CleanUpDeletedAlertRules(ctx context.Context) (int64, error)
}
// CleanUpService is the background service that periodically runs the
// housekeeping jobs defined in this file (temporary files, expired snapshots,
// dashboard versions, images, annotations, invites, query history, short URLs
// and deleted alert rules).
type CleanUpService struct {
// log is the base logger; methods derive a per-call logger via FromContext.
log log.Logger
// tracer starts the spans wrapped around a cleanup run, each job and each
// temporary directory.
tracer tracing.Tracer
// store is the database handle passed to ProvideService.
// NOTE(review): not referenced by the methods shown in this file.
store db.DB
// Cfg holds the settings the jobs read (temporary directories, lifetimes,
// retention switches).
Cfg *setting.Cfg
// Features is consulted for FlagKubernetesShortURLs when deleting short URLs.
Features featuremgmt.FeatureToggles
// ServerLockService is passed to ProvideService.
// NOTE(review): not referenced by the methods shown in this file.
ServerLockService *serverlock.ServerLockService
// ShortURLService deletes stale short URLs when the Kubernetes short-URL
// feature flag is off.
ShortURLService shorturls.Service
// QueryHistoryService deletes stale query history and enforces row limits.
QueryHistoryService queryhistory.Service
// dashboardVersionService deletes expired dashboard versions.
dashboardVersionService dashver.Service
// dashboardSnapshotService deletes expired dashboard snapshots.
dashboardSnapshotService dashboardsnapshots.Service
// deleteExpiredImageService deletes expired images; only called when
// unified alerting is enabled.
deleteExpiredImageService *image.DeleteExpiredService
// tempUserService expires old user invites and email verifications.
tempUserService tempuser.Service
// annotationCleaner removes old annotations and annotation tags.
annotationCleaner annotations.Cleaner
// alertRuleService cleans up deleted alert rules.
alertRuleService AlertRuleService
// clientConfigProvider supplies the REST config used to build the dynamic
// Kubernetes client for short-URL cleanup.
clientConfigProvider grafanaapiserver.RestConfigProvider
// orgService lists the organizations whose namespaces are scanned for
// expired Kubernetes short URLs.
orgService org.Service
}
// ProvideService wires the given dependencies into a new CleanUpService.
// The service logger is created here under the name "cleanup"; every other
// field is copied from the corresponding argument.
func ProvideService(
	cfg *setting.Cfg,
	Features featuremgmt.FeatureToggles,
	serverLockService *serverlock.ServerLockService,
	shortURLService shorturls.Service,
	sqlstore db.DB,
	queryHistoryService queryhistory.Service,
	dashboardVersionService dashver.Service,
	dashSnapSvc dashboardsnapshots.Service,
	deleteExpiredImageService *image.DeleteExpiredService,
	tempUserService tempuser.Service,
	tracer tracing.Tracer,
	annotationCleaner annotations.Cleaner,
	service AlertRuleService,
	clientConfigProvider grafanaapiserver.RestConfigProvider,
	orgService org.Service,
) *CleanUpService {
	return &CleanUpService{
		// Infrastructure.
		log:               log.New("cleanup"),
		tracer:            tracer,
		store:             sqlstore,
		Cfg:               cfg,
		Features:          Features,
		ServerLockService: serverLockService,

		// Domain services used by the individual cleanup jobs.
		ShortURLService:           shortURLService,
		QueryHistoryService:       queryHistoryService,
		dashboardVersionService:   dashboardVersionService,
		dashboardSnapshotService:  dashSnapSvc,
		deleteExpiredImageService: deleteExpiredImageService,
		tempUserService:           tempUserService,
		annotationCleaner:         annotationCleaner,
		alertRuleService:          service,
		clientConfigProvider:      clientConfigProvider,
		orgService:                orgService,
	}
}
// cleanUpJob pairs a human-readable job name with the function that performs
// the job. The field order (name, fn) is relied upon by positional literals.
type cleanUpJob struct {
	// name labels the job in logs and is used as its trace span name.
	name string
	// fn executes the job with the supplied context.
	fn func(context.Context)
}

// String renders the job as its name in Go double-quoted, escaped form, so a
// slice of jobs formats as a list of quoted names.
func (job cleanUpJob) String() string {
	// Two extra bytes cover the surrounding quotes in the common case.
	quoted := make([]byte, 0, len(job.name)+2)
	quoted = strconv.AppendQuote(quoted, job.name)
	return string(quoted)
}
// Run is the service's main loop. It cleans the temporary directories once
// immediately, then runs the full set of cleanup jobs every ten minutes until
// ctx is done, at which point it returns ctx.Err().
func (srv *CleanUpService) Run(ctx context.Context) error {
	srv.cleanUpTmpFiles(ctx)

	ticker := time.NewTicker(time.Minute * 10)
	// Release the ticker's resources when the loop exits.
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			srv.clean(ctx)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// clean runs every cleanup job once, sequentially, under a shared nine-minute
// timeout. The whole run is wrapped in one span and each job in its own span
// started from the run's context. If the context is done before a job starts,
// the remaining jobs are skipped and an error is logged.
func (srv *CleanUpService) clean(ctx context.Context) {
	const timeout = 9 * time.Minute
	startedAt := time.Now()

	ctx, runSpan := srv.tracer.Start(ctx, "cleanup background job")
	defer runSpan.End()

	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Jobs that always run.
	jobs := []cleanUpJob{
		{name: "clean up temporary files", fn: srv.cleanUpTmpFiles},
		{name: "delete expired snapshots", fn: srv.deleteExpiredSnapshots},
		{name: "delete expired dashboard versions", fn: srv.deleteExpiredDashboardVersions},
		{name: "delete expired images", fn: srv.deleteExpiredImages},
		{name: "cleanup old annotations", fn: srv.cleanUpOldAnnotations},
		{name: "expire old user invites", fn: srv.expireOldUserInvites},
		{name: "delete stale query history", fn: srv.deleteStaleQueryHistory},
		{name: "expire old email verifications", fn: srv.expireOldVerifications},
	}

	// Jobs that only run when their retention setting is positive.
	if srv.Cfg.ShortLinkExpiration > 0 {
		jobs = append(jobs, cleanUpJob{name: "delete stale short URLs", fn: srv.deleteStaleShortURLs})
	}
	if srv.Cfg.UnifiedAlerting.DeletedRuleRetention > 0 {
		jobs = append(jobs, cleanUpJob{name: "cleanup trash alert rules", fn: srv.cleanUpTrashAlertRules})
	}

	logger := srv.log.FromContext(ctx)
	logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", jobs))

	for _, job := range jobs {
		if ctx.Err() != nil {
			logger.Error("Cancelled cleanup job", "error", ctx.Err(), "duration", time.Since(startedAt))
			return
		}

		// Each job span is a sibling under the run's context, not nested in
		// the previous job's span.
		jobCtx, jobSpan := srv.tracer.Start(ctx, job.name)
		job.fn(jobCtx)
		jobSpan.End()
	}

	logger.Info("Completed cleanup jobs", "duration", time.Since(startedAt))
}
// cleanUpOldAnnotations runs the annotation cleaner and logs the outcome.
// A context.DeadlineExceeded error is not reported as a failure: in that case
// the counts returned by the cleaner are logged at debug level, exactly as on
// success.
func (srv *CleanUpService) cleanUpOldAnnotations(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	affected, affectedTags, err := srv.annotationCleaner.Run(ctx, srv.Cfg)
	if err == nil || errors.Is(err, context.DeadlineExceeded) {
		logger.Debug("Deleted excess annotations", "annotations affected", affected, "annotation tags affected", affectedTags)
		return
	}

	logger.Error("failed to clean up old annotations", "error", err)
}
// cleanUpTmpFiles cleans the configured images, CSVs and PDFs directories, in
// that order. Each directory is processed inside its own span, tagged with the
// directory path.
func (srv *CleanUpService) cleanUpTmpFiles(ctx context.Context) {
	tmpDirs := []string{srv.Cfg.ImagesDir, srv.Cfg.CSVsDir, srv.Cfg.PDFsDir}

	for i := range tmpDirs {
		dir := tmpDirs[i]

		dirCtx, dirSpan := srv.tracer.Start(ctx, "delete stale files in temporary directory")
		dirSpan.SetAttributes(attribute.String("directory", dir))
		srv.cleanUpTmpFolder(dirCtx, dir)
		dirSpan.End()
	}
}
// cleanUpTmpFolder removes every entry directly inside folder whose
// modification time makes it eligible according to shouldCleanupTempFile.
//
// A folder that does not exist is silently skipped. Failures to read the
// folder, stat an entry or remove an entry are logged and do not stop the
// remaining work. The final debug line reports how many entries were actually
// removed ("deleted") and how many remain ("kept").
func (srv *CleanUpService) cleanUpTmpFolder(ctx context.Context, folder string) {
	logger := srv.log.FromContext(ctx)
	if _, err := os.Stat(folder); os.IsNotExist(err) {
		return
	}

	files, err := os.ReadDir(folder)
	if err != nil {
		logger.Error("Problem reading dir", "folder", folder, "error", err)
		return
	}

	// Collect candidates first so that one timestamp is used for every entry.
	var toDelete []fs.DirEntry
	now := time.Now()
	for _, file := range files {
		info, err := file.Info()
		if err != nil {
			logger.Error("Problem reading file", "folder", folder, "file", file.Name(), "error", err)
			continue
		}

		if srv.shouldCleanupTempFile(info.ModTime(), now) {
			toDelete = append(toDelete, file)
		}
	}

	// Count only successful removals so the summary reflects what happened.
	deleted := 0
	for _, file := range toDelete {
		fullPath := path.Join(folder, file.Name())
		if err := os.Remove(fullPath); err != nil {
			logger.Error("Failed to delete temp file", "file", file.Name(), "error", err)
			continue
		}
		deleted++
	}

	logger.Debug("Found old rendered file to delete", "folder", folder, "deleted", deleted, "kept", len(files)-deleted)
}
// shouldCleanupTempFile reports whether a file last modified at filemtime has
// outlived the configured TempDataLifetime as of now. A lifetime of zero
// disables cleanup; a file whose lifetime ends exactly at now is kept.
func (srv *CleanUpService) shouldCleanupTempFile(filemtime time.Time, now time.Time) bool {
	lifetime := srv.Cfg.TempDataLifetime
	if lifetime != 0 {
		expiresAt := filemtime.Add(lifetime)
		return now.After(expiresAt)
	}
	return false
}
// deleteExpiredSnapshots asks the dashboard snapshot service to delete expired
// snapshots and logs either the error or the number of deleted rows reported
// back on the command.
func (srv *CleanUpService) deleteExpiredSnapshots(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	var cmd dashboardsnapshots.DeleteExpiredSnapshotsCommand
	err := srv.dashboardSnapshotService.DeleteExpiredSnapshots(ctx, &cmd)
	if err != nil {
		logger.Error("Failed to delete expired snapshots", "error", err.Error())
		return
	}

	logger.Debug("Deleted expired snapshots", "rows affected", cmd.DeletedRows)
}
// deleteExpiredDashboardVersions asks the dashboard version service to delete
// expired versions and logs either the error or the number of deleted rows
// reported back on the command.
func (srv *CleanUpService) deleteExpiredDashboardVersions(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	var cmd dashver.DeleteExpiredVersionsCommand
	err := srv.dashboardVersionService.DeleteExpired(ctx, &cmd)
	if err != nil {
		logger.Error("Failed to delete expired dashboard versions", "error", err.Error())
		return
	}

	logger.Debug("Deleted old/expired dashboard versions", "rows affected", cmd.DeletedRows)
}
// deleteExpiredImages deletes expired images through the image service and
// logs the outcome. It does nothing when unified alerting is disabled.
func (srv *CleanUpService) deleteExpiredImages(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	alertingEnabled := srv.Cfg.UnifiedAlerting.IsEnabled()
	if !alertingEnabled {
		return
	}

	rowsAffected, err := srv.deleteExpiredImageService.DeleteExpired(ctx)
	if err != nil {
		logger.Error("Failed to delete expired images", "error", err.Error())
		return
	}

	logger.Debug("Deleted expired images", "rows affected", rowsAffected)
}
// expireOldUserInvites expires user invites older than the configured
// UserInviteMaxLifetime and logs either the error or the number expired.
func (srv *CleanUpService) expireOldUserInvites(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	// Invites older than this instant are expired.
	cutoff := time.Now().Add(-srv.Cfg.UserInviteMaxLifetime)
	cmd := tempuser.ExpireTempUsersCommand{OlderThan: cutoff}

	err := srv.tempUserService.ExpireOldUserInvites(ctx, &cmd)
	if err != nil {
		logger.Error("Problem expiring user invites", "error", err.Error())
		return
	}

	logger.Debug("Expired user invites", "rows affected", cmd.NumExpired)
}
// expireOldVerifications expires email verifications older than the configured
// VerificationEmailMaxLifetime and logs either the error or the number expired.
func (srv *CleanUpService) expireOldVerifications(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	// Verifications older than this instant are expired.
	cutoff := time.Now().Add(-srv.Cfg.VerificationEmailMaxLifetime)
	cmd := tempuser.ExpireTempUsersCommand{OlderThan: cutoff}

	err := srv.tempUserService.ExpireOldVerifications(ctx, &cmd)
	if err != nil {
		logger.Error("Problem expiring email verifications", "error", err.Error())
		return
	}

	logger.Debug("Expired email verifications", "rows affected", cmd.NumExpired)
}
// deleteStaleShortURLs removes short URLs older than ShortLinkExpiration days.
// When the Kubernetes short-URL feature flag is enabled globally the work is
// delegated to deleteStaleKubernetesShortURLs; otherwise the short URL service
// deletes them and the outcome is logged here.
func (srv *CleanUpService) deleteStaleShortURLs(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	if srv.Features.IsEnabledGlobally(featuremgmt.FlagKubernetesShortURLs) {
		srv.deleteStaleKubernetesShortURLs(ctx)
		return
	}

	// ShortLinkExpiration is expressed in days.
	maxAge := time.Duration(srv.Cfg.ShortLinkExpiration*24) * time.Hour
	cmd := shorturls.DeleteShortUrlCommand{OlderThan: time.Now().Add(-maxAge)}

	err := srv.ShortURLService.DeleteStaleShortURLs(ctx, &cmd)
	if err != nil {
		logger.Error("Problem deleting stale short urls", "error", err.Error())
		return
	}

	logger.Debug("Deleted short urls", "rows affected", cmd.NumDeleted)
}
// deleteStaleKubernetesShortURLs deletes expired short URLs through the
// Kubernetes-style API using a dynamic client.
//
// For every organization returned by the org service it lists the short URLs
// in the organization's namespace and deletes those that have never been
// accessed (Status.LastSeenAt == 0) and were created more than
// ShortLinkExpiration days ago.
//
// Failures are logged and never returned. A failure to list one
// organization's short URLs no longer aborts the whole run: the remaining
// organizations are still processed, unless ctx itself is done, in which case
// the loop stops early. The total number of deletions is always logged at the
// end once the organization list has been obtained.
func (srv *CleanUpService) deleteStaleKubernetesShortURLs(ctx context.Context) {
	logger := srv.log.FromContext(ctx)
	logger.Debug("Starting deleting expired Kubernetes shortURLs")

	// Create the dynamic client for Kubernetes API
	restConfig, err := srv.clientConfigProvider.GetRestConfig(ctx)
	if err != nil {
		logger.Error("Failed to get REST config for Kubernetes client", "error", err.Error())
		return
	}

	client, err := dynamic.NewForConfig(restConfig)
	if err != nil {
		logger.Error("Failed to create Kubernetes client", "error", err.Error())
		return
	}

	// Set up the GroupVersionResource for shortURLs
	kind := v1alpha1.ShortURLKind()
	gvr := schema.GroupVersionResource{
		Group:    kind.Group(),
		Version:  kind.Version(),
		Resource: kind.Plural(),
	}
	shortURLClient := client.Resource(gvr)

	// Calculate the expiration time (ShortLinkExpiration is in days)
	expirationTime := time.Now().Add(-time.Duration(srv.Cfg.ShortLinkExpiration*24) * time.Hour)
	expirationTimestamp := expirationTime.Unix()

	orgs, err := srv.orgService.Search(ctx, &org.SearchOrgsQuery{})
	if err != nil {
		logger.Error("Failed to list organizations", "error", err.Error())
		return
	}

	// The mapper depends only on configuration, so build it once.
	namespaceMapper := request.GetNamespaceMapper(srv.Cfg)

	deletedCount := 0
	for _, o := range orgs {
		orgNamespace := namespaceMapper(o.ID)

		// Requests for this organization run under a service identity derived
		// from the job context. NOTE(review): the second return value is
		// discarded, as in the previous implementation.
		orgCtx, _ := identity.WithServiceIdentity(ctx, o.ID)

		shortURLs, err := shortURLClient.Namespace(orgNamespace).List(orgCtx, v1.ListOptions{})
		if err != nil {
			logger.Error("Failed to list shortURLs", "namespace", orgNamespace, "error", err.Error())
			if ctx.Err() != nil {
				// The job context is done; further requests would fail too.
				break
			}
			// Keep going so one failing organization does not block the rest.
			continue
		}

		// Check each shortURL for expiration
		for _, item := range shortURLs.Items {
			// Convert unstructured object to ShortURL struct
			var shortURL v1alpha1.ShortURL
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(item.Object, &shortURL); err != nil {
				logger.Error("Failed to convert unstructured object to ShortURL", "name", item.GetName(), "namespace", item.GetNamespace(), "error", err.Error())
				continue
			}

			// Only delete if lastSeenAt is 0 (never accessed) and the creation
			// time is older than the expiration time.
			if shortURL.Status.LastSeenAt != 0 || shortURL.CreationTimestamp.Unix() >= expirationTimestamp {
				continue
			}

			namespace := shortURL.Namespace
			name := shortURL.Name
			err := shortURLClient.Namespace(namespace).Delete(orgCtx, name, v1.DeleteOptions{})
			switch {
			case err == nil:
				deletedCount++
				logger.Debug("Successfully deleted expired shortURL", "name", name, "namespace", namespace, "creationTime", shortURL.CreationTimestamp.Unix(), "expirationTime", expirationTimestamp)
			case k8serrors.IsNotFound(err):
				// Expected if the resource was already deleted.
				logger.Debug("ShortURL already deleted", "name", name, "namespace", namespace)
			default:
				logger.Error("Failed to delete expired shortURL", "name", name, "namespace", namespace, "error", err.Error())
			}
		}
	}

	logger.Debug("Deleted expired Kubernetes shortURLs", "count", deletedCount)
}
// deleteStaleQueryHistory performs three independent housekeeping steps on
// query history, logging the outcome of each; a failure in one step does not
// prevent the following steps from running:
//
//  1. delete query history older than 14 days (the original comment notes
//     that starred queries are excepted),
//  2. enforce a 200k row limit for the query_history table,
//  3. enforce a 150k row limit for the query_history_star table.
func (srv *CleanUpService) deleteStaleQueryHistory(ctx context.Context) {
	const (
		maxQueryHistoryLifetime = 14 * 24 * time.Hour
		queryHistoryLimit       = 200000
		queryHistoryStarLimit   = 150000
	)

	logger := srv.log.FromContext(ctx)
	history := srv.QueryHistoryService

	// Step 1: age-based deletion.
	olderThan := time.Now().Add(-maxQueryHistoryLifetime).Unix()
	if deleted, err := history.DeleteStaleQueriesInQueryHistory(ctx, olderThan); err != nil {
		logger.Error("Problem deleting stale query history", "error", err.Error())
	} else {
		logger.Debug("Deleted stale query history", "rows affected", deleted)
	}

	// Step 2: row limit for query_history.
	if trimmed, err := history.EnforceRowLimitInQueryHistory(ctx, queryHistoryLimit, false); err != nil {
		logger.Error("Problem with enforcing row limit for query_history", "error", err.Error())
	} else {
		logger.Debug("Enforced row limit for query_history", "rows affected", trimmed)
	}

	// Step 3: row limit for query_history_star.
	if trimmed, err := history.EnforceRowLimitInQueryHistory(ctx, queryHistoryStarLimit, true); err != nil {
		logger.Error("Problem with enforcing row limit for query_history_star", "error", err.Error())
	} else {
		logger.Debug("Enforced row limit for query_history_star", "rows affected", trimmed)
	}
}
// cleanUpTrashAlertRules invokes the alert rule service's cleanup of deleted
// alert rules and logs either the error or the number of rows affected.
func (srv *CleanUpService) cleanUpTrashAlertRules(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	affected, err := srv.alertRuleService.CleanUpDeletedAlertRules(ctx)
	if err != nil {
		logger.Error("Problem cleaning up deleted alert rules", "error", err)
		return
	}

	logger.Debug("Cleaned up deleted alert rules", "rows affected", affected)
}