Files
grafana/pkg/registry/apis/provisioning/webhooks/webhook.go
T
Roberto Jiménez Sánchez cdc6a6114c Provisioning: Improve logging and tracing in job processing (#113454)
* Provisioning: Improve logging and tracing in job processing

- Add comprehensive tracing with OpenTelemetry spans across all job operations
- Enhance logging with consistent style: lowercase, concise messages, appropriate log levels
- Use past tense for completed lifecycle events (e.g., 'stopped' vs 'stop')
- Add structured logging with contextual attributes for better searchability
- Handle graceful shutdowns without throwing errors on context cancellation
- Refactor Cleanup method into listExpiredJobs and cleanUpExpiredJob for better code quality
- Avoid double logging by only logging errors when handled locally
- Add tracing and logging to historyjob controller cleanup operations

Files modified:
- pkg/registry/apis/provisioning/jobs/driver.go: Add tracing spans and improve error handling for graceful shutdown
- pkg/registry/apis/provisioning/jobs/concurrent_driver.go: Add tracing and consistent logging
- pkg/registry/apis/provisioning/jobs/persistentstore.go: Add comprehensive tracing and logging to all public methods, refactor cleanup
- apps/provisioning/pkg/controller/historyjob.go: Add tracing and improve logging consistency

* Update pkg/registry/apis/provisioning/jobs/persistentstore.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Refactor logging in persistentstore.go

- Remove debug log statements at the start of job operations for cleaner output
- Maintain structured logging with contextual attributes for improved traceability

Files modified:
- pkg/registry/apis/provisioning/jobs/persistentstore.go: Clean up logging for job operations

* Enhance logging and tracing in provisioning job operations

- Introduce OpenTelemetry spans for better observability in job processing and webhook handling
- Improve structured logging with contextual attributes for key operations
- Remove unnecessary tracing spans in long-running functions to streamline performance
- Update error handling to record errors in spans for better traceability

Files modified:
- pkg/registry/apis/provisioning/controller/repository.go: Add tracing and structured logging to sync job operations
- pkg/registry/apis/provisioning/jobs/concurrent_driver.go: Remove tracing span from long-running function
- pkg/registry/apis/provisioning/jobs/driver.go: Enhance logging and tracing in job processing
- pkg/registry/apis/provisioning/webhooks/webhook.go: Implement tracing and structured logging for webhook connections

* Update pkg/registry/apis/provisioning/jobs/driver.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Improve error handling in ConcurrentJobDriver to differentiate between graceful shutdown and unexpected stops

* Remove unused import in driver.go to clean up code

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-11-12 14:59:27 +01:00

230 lines
7.1 KiB
Go

package webhooks
import (
"context"
"fmt"
"net/http"
"time"
"go.opentelemetry.io/otel/attribute"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/authorization/authorizer"
"k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
"k8s.io/kube-openapi/pkg/spec3"
"github.com/grafana/grafana-app-sdk/logging"
provisioning "github.com/grafana/grafana/apps/provisioning/pkg/apis/provisioning/v0alpha1"
"github.com/grafana/grafana/apps/provisioning/pkg/repository"
"github.com/grafana/grafana/pkg/apimachinery/identity"
"github.com/grafana/grafana/pkg/infra/tracing"
provisioningapis "github.com/grafana/grafana/pkg/registry/apis/provisioning"
"github.com/grafana/grafana/pkg/registry/apis/provisioning/webhooks/pullrequest"
"github.com/prometheus/client_golang/prometheus"
)
// WebhookRepository is the capability a repository must expose to receive
// webhook calls. The webhook connector type-asserts the resolved repository to
// this interface and rejects the request when the assertion fails.
type WebhookRepository interface {
	// Webhook processes the incoming HTTP request and returns the response to
	// send back, or an error. The connector treats a nil response with a nil
	// error as a failure.
	Webhook(ctx context.Context, req *http.Request) (*provisioning.WebhookResponse, error)
}
// webhookMaxBodySize is the upper bound, in bytes, applied to the body of an
// incoming webhook request: 25 MiB (26,214,400 bytes).
// See https://docs.github.com/en/webhooks/webhook-events-and-payloads
const webhookMaxBodySize = 25 << 20
// webhookConnector is the storage/connecter registered for the "webhook"
// subresource of provisioning repositories.
// This only works for github right now
type webhookConnector struct {
	// webhooksEnabled gates request handling: when false, the handler answers
	// every request with a bad-request error.
	webhooksEnabled bool
	// core provides the repository lookup, job queue, status patcher and API
	// group version used by the connector.
	core *provisioningapis.APIBuilder
	// renderer is stored at construction time; no method in this section of
	// the file reads it.
	renderer pullrequest.ScreenshotRenderer
	// registry is the Prometheus registerer the webhook metrics were
	// registered with.
	registry prometheus.Registerer
	// metrics records how each processed webhook event was handled.
	metrics webhookMetrics
}
// NewWebhookConnector builds a webhookConnector from its dependencies and
// registers the webhook metrics with the supplied Prometheus registerer.
func NewWebhookConnector(
	webhooksEnabled bool,
	// TODO: use interface for this
	core *provisioningapis.APIBuilder,
	renderer pullrequest.ScreenshotRenderer,
	registry prometheus.Registerer,
) *webhookConnector {
	connector := &webhookConnector{
		webhooksEnabled: webhooksEnabled,
		core:            core,
		renderer:        renderer,
		registry:        registry,
	}
	// Metrics are registered against the same registerer that is stored above.
	connector.metrics = registerWebhookMetrics(registry)
	return connector
}
// New returns a fresh, empty WebhookResponse object.
func (*webhookConnector) New() runtime.Object {
	return new(provisioning.WebhookResponse)
}
// Destroy is a no-op for the webhook connector.
func (*webhookConnector) Destroy() {
}
// ProducesMIMETypes reports the response content types of the webhook
// endpoint. The result is the same for every verb: JSON only.
func (*webhookConnector) ProducesMIMETypes(verb string) []string {
	mimeTypes := []string{"application/json"}
	return mimeTypes
}
// ProducesObject returns an empty WebhookResponse as the sample of what the
// endpoint produces. The verb is ignored.
func (*webhookConnector) ProducesObject(verb string) any {
	return new(provisioning.WebhookResponse)
}
// ConnectMethods lists the HTTP methods accepted by the webhook endpoint.
func (*webhookConnector) ConnectMethods() []string {
	methods := []string{
		http.MethodPost,
		// GET is only useful for browser testing, should be removed
		http.MethodGet,
	}
	return methods
}
// NewConnectOptions declares that the webhook endpoint takes no connect
// options: it returns a nil options object, false, and an empty string.
func (*webhookConnector) NewConnectOptions() (runtime.Object, bool, string) {
	var noOptions runtime.Object
	return noOptions, false, ""
}
// Authorize allows every request that targets the "webhook" subresource of the
// repository resource and expresses no opinion on anything else.
func (s *webhookConnector) Authorize(ctx context.Context, a authorizer.Attributes) (decision authorizer.Decision, reason string, err error) {
	// Anything that is not <repositories>/webhook is left to other authorizers.
	if provisioning.RepositoryResourceInfo.GetName() != a.GetResource() || a.GetSubresource() != "webhook" {
		return authorizer.DecisionNoOpinion, "", nil
	}

	// When the resource is a webhook, we'll deal with permissions manually by checking signatures or similar in the webhook handler.
	// The user in this context is usually an anonymous user, but may also be an authenticated synthetic check by the Grafana instance's operator as well.
	// For context on the anonymous user, check the authn/clients/provisioning.go file.
	return authorizer.DecisionAllow, "", nil
}
// UpdateStorage registers the connector in the given storage map under the
// repository resource's "webhook" storage path. It always returns nil.
func (s *webhookConnector) UpdateStorage(storage map[string]rest.Storage) error {
	webhookPath := provisioning.RepositoryResourceInfo.StoragePath("webhook")
	storage[webhookPath] = s
	return nil
}
// PostProcessOpenAPI sets the description of the POST operation on the
// repository webhook path of the given OpenAPI document.
//
// The document is left untouched when it has no paths, when the webhook path
// is absent, or when that path has no POST operation. It always returns nil.
func (s *webhookConnector) PostProcessOpenAPI(oas *spec3.OpenAPI) error {
	if oas == nil || oas.Paths == nil {
		return nil
	}

	root := "/apis/" + s.core.GetGroupVersion().String() + "/"
	repoprefix := root + "namespaces/{namespace}/repositories/{name}"

	// Guard on the operation that is actually written to (POST); checking a
	// different operation would allow a nil dereference here.
	if sub := oas.Paths.Paths[repoprefix+"/webhook"]; sub != nil && sub.Post != nil {
		sub.Post.Description = "Currently only supports github webhooks"
	}
	return nil
}
// Connect resolves the named repository under the provisioning identity and
// returns the HTTP handler that serves webhook calls for it, wrapped in a
// 30 second timeout.
//
// Per request, the handler:
//   - rejects the call when webhooks are disabled or the repository does not
//     implement WebhookRepository;
//   - caps the request body at webhookMaxBodySize;
//   - delegates to the repository's Webhook method;
//   - attempts to refresh the webhook "last event" status (failures are only logged);
//   - inserts the job from the response into the job queue, if there is one,
//     and answers with the created job; otherwise answers with the response itself.
//
// An error is returned when the provisioning identity cannot be set up or the
// repository cannot be loaded.
func (s *webhookConnector) Connect(ctx context.Context, name string, opts runtime.Object, responder rest.Responder) (http.Handler, error) {
	namespace := request.NamespaceValue(ctx)

	ctx, _, err := identity.WithProvisioningIdentity(ctx, namespace)
	if err != nil {
		return nil, err
	}

	// Get the repository with the worker identity (since the request user is likely anonymous)
	repo, err := s.core.GetRepository(ctx, name)
	if err != nil {
		return nil, err
	}

	handle := func(w http.ResponseWriter, r *http.Request) {
		reqCtx, span := tracing.Start(r.Context(), "provisioning.webhook.handle")
		defer span.End()
		span.SetAttributes(
			attribute.String("repository", name),
			attribute.String("namespace", namespace),
		)

		reqLogger := logging.FromContext(reqCtx).With("logger", "webhook-connector", "repo", name)
		reqCtx = logging.Context(reqCtx, reqLogger)

		if !s.webhooksEnabled {
			responder.Error(errors.NewBadRequest("webhooks are not enabled"))
			return
		}

		webhookRepo, supported := repo.(WebhookRepository)
		if !supported {
			responder.Error(errors.NewBadRequest("the repository does not support webhooks"))
			return
		}

		// Limit the webhook request body size
		r.Body = http.MaxBytesReader(w, r.Body, webhookMaxBodySize)

		result, err := webhookRepo.Webhook(reqCtx, r)
		if err != nil {
			span.RecordError(err)
			responder.Error(err)
			return
		}
		if result == nil {
			missing := fmt.Errorf("expecting a response")
			span.RecordError(missing)
			responder.Error(missing)
			return
		}

		// Continue processing on failure as this is non-critical; the update is purely informational
		if patchErr := s.updateLastEvent(reqCtx, repo); patchErr != nil {
			reqLogger.Error("failed to update last event", "error", patchErr)
		}

		// The metric is recorded on the way out so it reflects the final action.
		action := "none"
		defer func() {
			s.metrics.recordEventProcessed(action)
		}()

		// No job requested: answer with the repository's response as-is.
		if result.Job == nil {
			responder.Object(result.Code, result)
			return
		}

		result.Job.Repository = name
		action = string(result.Job.Action)
		span.SetAttributes(attribute.String("job.action", action))

		created, err := s.core.GetJobQueue().Insert(reqCtx, namespace, *result.Job)
		if err != nil {
			span.RecordError(err)
			reqLogger.Error("failed to insert job", "error", err)
			responder.Error(err)
			return
		}

		span.SetAttributes(attribute.String("job.name", created.Name))
		reqLogger.Info("webhook job created", "job", created.Name, "action", action)
		responder.Object(result.Code, created)
	}

	return provisioningapis.WithTimeout(http.HandlerFunc(handle), 30*time.Second), nil
}
// updateLastEvent updates the last event time for the webhook.
// This is to provide some visibility that the webhook is still active and working.
// It's not a good idea to update the webhook status too often, so the status is
// only patched when the recorded last event is more than a minute old.
//
// Nothing is patched when the repository has no webhook status. An error is
// returned when the status patcher is nil or when the patch itself fails.
func (s *webhookConnector) updateLastEvent(ctx context.Context, repo repository.Repository) error {
	patcher := s.core.GetStatusPatcher()
	if patcher == nil {
		// This would only happen if we wired things up incorrectly
		return fmt.Errorf("status patcher is nil")
	}

	cfg := repo.Config()

	// The webhook status is nillable; check it before reading LastEvent so a
	// repository without webhook status does not cause a nil dereference.
	webhookStatus := cfg.Status.Webhook
	if webhookStatus == nil {
		return nil
	}

	lastEvent := time.UnixMilli(webhookStatus.LastEvent)
	if time.Since(lastEvent) <= time.Minute {
		// Updated recently enough; skip the patch to avoid churning the status.
		return nil
	}

	patchOp := map[string]any{
		"op":    "replace",
		"path":  "/status/webhook/lastEvent",
		"value": time.Now().UnixMilli(),
	}
	if err := patcher.Patch(ctx, cfg, patchOp); err != nil {
		return fmt.Errorf("patch status: %w", err)
	}
	return nil
}
// Compile-time checks that *webhookConnector satisfies the REST interfaces it
// is registered as.
var _ rest.Storage = (*webhookConnector)(nil)
var _ rest.Connecter = (*webhookConnector)(nil)
var _ rest.StorageMetadata = (*webhookConnector)(nil)