Files
grafana/apps/provisioning/pkg/controller/historyjob.go
T
Roberto Jiménez Sánchez cdc6a6114c Provisioning: Improve logging and tracing in job processing (#113454)
* Provisioning: Improve logging and tracing in job processing

- Add comprehensive tracing with OpenTelemetry spans across all job operations
- Enhance logging with consistent style: lowercase, concise messages, appropriate log levels
- Use past tense for completed lifecycle events (e.g., 'stopped' vs 'stop')
- Add structured logging with contextual attributes for better searchability
- Handle graceful shutdowns without throwing errors on context cancellation
- Refactor Cleanup method into listExpiredJobs and cleanUpExpiredJob for better code quality
- Avoid double logging by only logging errors when handled locally
- Add tracing and logging to historyjob controller cleanup operations

Files modified:
- pkg/registry/apis/provisioning/jobs/driver.go: Add tracing spans and improve error handling for graceful shutdown
- pkg/registry/apis/provisioning/jobs/concurrent_driver.go: Add tracing and consistent logging
- pkg/registry/apis/provisioning/jobs/persistentstore.go: Add comprehensive tracing and logging to all public methods, refactor cleanup
- apps/provisioning/pkg/controller/historyjob.go: Add tracing and improve logging consistency

* Update pkg/registry/apis/provisioning/jobs/persistentstore.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Refactor logging in persistentstore.go

- Remove debug log statements at the start of job operations for cleaner output
- Maintain structured logging with contextual attributes for improved traceability

Files modified:
- pkg/registry/apis/provisioning/jobs/persistentstore.go: Clean up logging for job operations

* Enhance logging and tracing in provisioning job operations

- Introduce OpenTelemetry spans for better observability in job processing and webhook handling
- Improve structured logging with contextual attributes for key operations
- Remove unnecessary tracing spans in long-running functions to streamline performance
- Update error handling to record errors in spans for better traceability

Files modified:
- pkg/registry/apis/provisioning/controller/repository.go: Add tracing and structured logging to sync job operations
- pkg/registry/apis/provisioning/jobs/concurrent_driver.go: Remove tracing span from long-running function
- pkg/registry/apis/provisioning/jobs/driver.go: Enhance logging and tracing in job processing
- pkg/registry/apis/provisioning/webhooks/webhook.go: Implement tracing and structured logging for webhook connections

* Update pkg/registry/apis/provisioning/jobs/driver.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Improve error handling in ConcurrentJobDriver to differentiate between graceful shutdown and unexpected stops

* Remove unused import in driver.go to clean up code

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-11-12 14:59:27 +01:00

100 lines
2.8 KiB
Go

package controller
import (
	"context"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apiserver/pkg/endpoints/request"
	"k8s.io/client-go/tools/cache"

	"github.com/grafana/grafana-app-sdk/logging"
	provisioning "github.com/grafana/grafana/apps/provisioning/pkg/apis/provisioning/v0alpha1"
	client "github.com/grafana/grafana/apps/provisioning/pkg/generated/clientset/versioned/typed/provisioning/v0alpha1"
	informer "github.com/grafana/grafana/apps/provisioning/pkg/generated/informers/externalversions/provisioning/v0alpha1"
	"github.com/grafana/grafana/pkg/apimachinery/identity"
)
// historyJobControllerLoggerName is the value attached under the "logger" key
// to every log line emitted by the HistoryJobController.
const historyJobControllerLoggerName = "provisioning-historyjob-controller"
// HistoryJobController manages the cleanup of old HistoryJob entries.
//
// It deletes HistoricJob objects whose age exceeds expirationTime. The check
// runs from the informer event handlers registered by NewHistoryJobController.
type HistoryJobController struct {
// client is the provisioning API client used to delete expired HistoricJob objects.
client client.ProvisioningV0alpha1Interface
// logger is the controller-scoped logger; per-job attributes are added at cleanup time.
logger logging.Logger
// expirationTime is the age threshold: a job is deleted only when the time
// since its creation timestamp is strictly greater than this duration.
expirationTime time.Duration
}
// NewHistoryJobController builds a HistoryJobController and registers it on
// the given HistoricJob informer.
//
// The controller keeps provisioningClient for issuing deletes and
// expirationTime as the age threshold. Its logger is derived from the default
// logger, tagged with the controller's logger name. An error is returned only
// when the event handler cannot be registered on the informer; in that case
// the returned controller is nil.
func NewHistoryJobController(
	provisioningClient client.ProvisioningV0alpha1Interface,
	historyJobInformer informer.HistoricJobInformer,
	expirationTime time.Duration,
) (*HistoryJobController, error) {
	ctrl := &HistoryJobController{
		expirationTime: expirationTime,
		logger:         logging.DefaultLogger.With("logger", historyJobControllerLoggerName),
		client:         provisioningClient,
	}

	// Cleanup is driven entirely by informer notifications: every add and
	// every update (which is how the shared informer's resyncs are expected to
	// arrive) re-evaluates the job against the expiration time.
	handlers := cache.ResourceEventHandlerFuncs{
		AddFunc: ctrl.cleanupJob,
		UpdateFunc: func(_, current interface{}) {
			ctrl.cleanupJob(current)
		},
	}
	if _, err := historyJobInformer.Informer().AddEventHandler(handlers); err != nil {
		return nil, err
	}

	return ctrl, nil
}
// cleanupJob deletes the given HistoricJob when it is older than the
// controller's expiration time.
//
// obj is the object delivered by the informer event handler. Anything that is
// not a *provisioning.HistoricJob is logged as an error and ignored. Jobs
// whose age (time since CreationTimestamp) does not exceed expirationTime are
// left untouched.
//
// Nothing is returned: failures are logged here because the informer callback
// has no caller to report to. A NotFound response from the delete call is
// treated as "already deleted" and only logged at debug level.
func (c *HistoryJobController) cleanupJob(obj interface{}) {
	job, ok := obj.(*provisioning.HistoricJob)
	if !ok {
		// Log the dynamic type name rather than the object itself: the "type"
		// attribute should identify what arrived, not dump its contents.
		c.logger.Error("unexpected object type - expected HistoricJob", "type", fmt.Sprintf("%T", obj))
		return
	}

	age := time.Since(job.CreationTimestamp.Time)
	// Only cleanup jobs older than expiration time
	if age <= c.expirationTime {
		return
	}

	namespace := job.Namespace
	logger := c.logger.With(
		"job", job.Name,
		"namespace", namespace,
		"age", age,
	)
	logger.Debug("start cleanup expired historic job")

	// The handler is not invoked with a context, so start from a background
	// context and attach the provisioning identity and namespace for the
	// delete call.
	ctx, _, err := identity.WithProvisioningIdentity(context.Background(), namespace)
	if err != nil {
		logger.Error("failed to set provisioning identity", "error", err)
		return
	}
	ctx = request.WithNamespace(ctx, namespace)

	err = c.client.HistoricJobs(namespace).Delete(ctx, job.Name, metav1.DeleteOptions{})
	if err != nil {
		if apierrors.IsNotFound(err) {
			// The job is already gone, so there is nothing left to clean up.
			logger.Debug("historic job already deleted")
			return
		}
		logger.Error("failed to delete expired historic job", "error", err)
		return
	}

	logger.Info("deleted expired historic job")
}