CloudMigrations: Fix traceability & HTTP Client initialisation (#94141)
* Add traceability to Migration Assistant feature * Fix some compilation errors * Fix lint issues * Use async context * Add trace for LibraryElements
This commit is contained in:
committed by
GitHub
parent
9680722b78
commit
19c77eaae1
@@ -12,6 +12,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
|
||||
"github.com/grafana/grafana/pkg/api/routing"
|
||||
"github.com/grafana/grafana/pkg/infra/db"
|
||||
"github.com/grafana/grafana/pkg/infra/kvstore"
|
||||
@@ -35,6 +36,7 @@ import (
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
)
|
||||
|
||||
@@ -83,6 +85,7 @@ var _ cloudmigration.Service = (*Service)(nil)
|
||||
// builds the service, and api, and configures routes
|
||||
func ProvideService(
|
||||
cfg *setting.Cfg,
|
||||
httpClientProvider *httpclient.Provider,
|
||||
features featuremgmt.FeatureToggles,
|
||||
db db.DB,
|
||||
dsService datasources.DataSourceService,
|
||||
@@ -118,15 +121,29 @@ func ProvideService(
|
||||
}
|
||||
s.api = api.RegisterApi(routeRegister, s, tracer)
|
||||
|
||||
s.objectStorage = objectstorage.NewS3()
|
||||
httpClientS3, err := httpClientProvider.New()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating http client for S3: %w", err)
|
||||
}
|
||||
s.objectStorage = objectstorage.NewS3(httpClientS3, tracer)
|
||||
|
||||
if !cfg.CloudMigration.IsDeveloperMode {
|
||||
c, err := gmsclient.NewGMSClient(cfg)
|
||||
httpClientGMS, err := httpClientProvider.New()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating http client for GMS: %w", err)
|
||||
}
|
||||
|
||||
c, err := gmsclient.NewGMSClient(cfg, httpClientGMS)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("initializing GMS client: %w", err)
|
||||
}
|
||||
s.gmsClient = c
|
||||
s.gcomService = gcom.New(gcom.Config{ApiURL: cfg.GrafanaComAPIURL, Token: cfg.CloudMigration.GcomAPIToken})
|
||||
|
||||
httpClientGcom, err := httpClientProvider.New()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating http client for GCOM: %w", err)
|
||||
}
|
||||
s.gcomService = gcom.New(gcom.Config{ApiURL: cfg.GrafanaComAPIURL, Token: cfg.CloudMigration.GcomAPIToken}, httpClientGcom)
|
||||
} else {
|
||||
s.gmsClient = gmsclient.NewInMemoryClient()
|
||||
s.gcomService = &gcomStub{policies: map[string]gcom.AccessPolicy{}, token: nil}
|
||||
@@ -169,7 +186,8 @@ func (s *Service) GetToken(ctx context.Context) (gcom.TokenView, error) {
|
||||
RequestID: requestID,
|
||||
Region: instance.RegionSlug,
|
||||
AccessPolicyName: accessPolicyName,
|
||||
TokenName: accessTokenName})
|
||||
TokenName: accessTokenName,
|
||||
})
|
||||
if err != nil {
|
||||
return gcom.TokenView{}, fmt.Errorf("listing tokens: %w", err)
|
||||
}
|
||||
@@ -279,9 +297,6 @@ func (s *Service) CreateToken(ctx context.Context) (cloudmigration.CreateAccessT
|
||||
}
|
||||
|
||||
func (s *Service) findAccessPolicyByName(ctx context.Context, regionSlug, accessPolicyName string) (*gcom.AccessPolicy, error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.findAccessPolicyByName")
|
||||
defer span.End()
|
||||
|
||||
accessPolicies, err := s.gcomService.ListAccessPolicies(ctx, gcom.ListAccessPoliciesParams{
|
||||
RequestID: tracing.TraceIDFromContext(ctx, false),
|
||||
Region: regionSlug,
|
||||
@@ -341,7 +356,7 @@ func (s *Service) DeleteToken(ctx context.Context, tokenID string) error {
|
||||
}
|
||||
|
||||
func (s *Service) GetSession(ctx context.Context, uid string) (*cloudmigration.CloudMigrationSession, error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.GetMigration")
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.GetSession")
|
||||
defer span.End()
|
||||
migration, err := s.store.GetMigrationSessionByUID(ctx, uid)
|
||||
if err != nil {
|
||||
@@ -352,6 +367,9 @@ func (s *Service) GetSession(ctx context.Context, uid string) (*cloudmigration.C
|
||||
}
|
||||
|
||||
func (s *Service) GetSessionList(ctx context.Context) (*cloudmigration.CloudMigrationSessionListResponse, error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.GetSessionList")
|
||||
defer span.End()
|
||||
|
||||
values, err := s.store.GetCloudMigrationSessionList(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("retrieving session list from store: %w", err)
|
||||
@@ -370,7 +388,7 @@ func (s *Service) GetSessionList(ctx context.Context) (*cloudmigration.CloudMigr
|
||||
}
|
||||
|
||||
func (s *Service) CreateSession(ctx context.Context, cmd cloudmigration.CloudMigrationSessionRequest) (*cloudmigration.CloudMigrationSessionResponse, error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.createMigration")
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.CreateSession")
|
||||
defer span.End()
|
||||
|
||||
base64Token := cmd.AuthToken
|
||||
@@ -405,6 +423,9 @@ func (s *Service) CreateSession(ctx context.Context, cmd cloudmigration.CloudMig
|
||||
}
|
||||
|
||||
func (s *Service) DeleteSession(ctx context.Context, sessionUID string) (*cloudmigration.CloudMigrationSession, error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.DeleteSession")
|
||||
defer span.End()
|
||||
|
||||
session, snapshots, err := s.store.DeleteMigrationSessionByUID(ctx, sessionUID)
|
||||
if err != nil {
|
||||
s.report(ctx, session, gmsclient.EventDisconnect, 0, err)
|
||||
@@ -470,26 +491,36 @@ func (s *Service) CreateSnapshot(ctx context.Context, signedInUser *user.SignedI
|
||||
s.cancelMutex.Unlock()
|
||||
}()
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
// Create context out the span context to ensure the trace is propagated
|
||||
asyncCtx := trace.ContextWithSpanContext(context.Background(), span.SpanContext())
|
||||
asyncCtx, asyncSpan := s.tracer.Start(asyncCtx, "CloudMigrationService.CreateSnapshotAsync")
|
||||
defer asyncSpan.End()
|
||||
|
||||
asyncCtx, cancelFunc := context.WithCancel(asyncCtx)
|
||||
s.cancelFunc = cancelFunc
|
||||
|
||||
s.report(ctx, session, gmsclient.EventStartBuildingSnapshot, 0, nil)
|
||||
s.report(asyncCtx, session, gmsclient.EventStartBuildingSnapshot, 0, nil)
|
||||
|
||||
start := time.Now()
|
||||
err := s.buildSnapshot(ctx, signedInUser, initResp.MaxItemsPerPartition, initResp.Metadata, snapshot)
|
||||
err := s.buildSnapshot(asyncCtx, signedInUser, initResp.MaxItemsPerPartition, initResp.Metadata, snapshot)
|
||||
if err != nil {
|
||||
asyncSpan.SetStatus(codes.Error, "error building snapshot")
|
||||
asyncSpan.RecordError(err)
|
||||
s.log.Error("building snapshot", "err", err.Error())
|
||||
|
||||
// Update status to error with retries
|
||||
if err := s.updateSnapshotWithRetries(context.Background(), cloudmigration.UpdateSnapshotCmd{
|
||||
if err := s.updateSnapshotWithRetries(asyncCtx, cloudmigration.UpdateSnapshotCmd{
|
||||
UID: snapshot.UID,
|
||||
SessionID: sessionUid,
|
||||
Status: cloudmigration.SnapshotStatusError,
|
||||
}); err != nil {
|
||||
s.log.Error("critical failure during snapshot creation - please report any error logs")
|
||||
asyncSpan.RecordError(err)
|
||||
}
|
||||
}
|
||||
|
||||
s.report(ctx, session, gmsclient.EventDoneBuildingSnapshot, time.Since(start), err)
|
||||
span.SetStatus(codes.Ok, "snapshot built")
|
||||
s.report(asyncCtx, session, gmsclient.EventDoneBuildingSnapshot, time.Since(start), err)
|
||||
}()
|
||||
|
||||
return &snapshot, nil
|
||||
@@ -624,32 +655,48 @@ func (s *Service) UploadSnapshot(ctx context.Context, sessionUid string, snapsho
|
||||
s.cancelMutex.Unlock()
|
||||
}()
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
s.cancelFunc = cancelFunc
|
||||
// Create context out the span context to ensure the trace is propagated
|
||||
asyncCtx := trace.ContextWithSpanContext(context.Background(), span.SpanContext())
|
||||
asyncCtx, asyncSpan := s.tracer.Start(asyncCtx, "CloudMigrationService.UploadSnapshot")
|
||||
defer asyncSpan.End()
|
||||
|
||||
s.report(ctx, session, gmsclient.EventStartUploadingSnapshot, 0, nil)
|
||||
asyncCtx, s.cancelFunc = context.WithCancel(asyncCtx)
|
||||
|
||||
s.report(asyncCtx, session, gmsclient.EventStartUploadingSnapshot, 0, nil)
|
||||
|
||||
start := time.Now()
|
||||
err := s.uploadSnapshot(ctx, session, snapshot, uploadUrl)
|
||||
err := s.uploadSnapshot(asyncCtx, session, snapshot, uploadUrl)
|
||||
if err != nil {
|
||||
asyncSpan.SetStatus(codes.Error, "error uploading snapshot")
|
||||
asyncSpan.RecordError(err)
|
||||
|
||||
s.log.Error("uploading snapshot", "err", err.Error())
|
||||
// Update status to error with retries
|
||||
if err := s.updateSnapshotWithRetries(context.Background(), cloudmigration.UpdateSnapshotCmd{
|
||||
if err := s.updateSnapshotWithRetries(asyncCtx, cloudmigration.UpdateSnapshotCmd{
|
||||
UID: snapshot.UID,
|
||||
SessionID: sessionUid,
|
||||
Status: cloudmigration.SnapshotStatusError,
|
||||
}); err != nil {
|
||||
asyncSpan.RecordError(err)
|
||||
s.log.Error("critical failure during snapshot upload - please report any error logs")
|
||||
}
|
||||
}
|
||||
|
||||
s.report(ctx, session, gmsclient.EventDoneUploadingSnapshot, time.Since(start), err)
|
||||
s.report(asyncCtx, session, gmsclient.EventDoneUploadingSnapshot, time.Since(start), err)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Service) CancelSnapshot(ctx context.Context, sessionUid string, snapshotUid string) (err error) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.CancelSnapshot",
|
||||
trace.WithAttributes(
|
||||
attribute.String("sessionUid", sessionUid),
|
||||
attribute.String("snapshotUid", snapshotUid),
|
||||
),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
// The cancel func itself is protected by a mutex in the async threads, so it may or may not be set by the time CancelSnapshot is called
|
||||
// Attempt to cancel and recover from the panic if the cancel function is nil
|
||||
defer func() {
|
||||
@@ -684,6 +731,9 @@ func (s *Service) report(
|
||||
d time.Duration,
|
||||
evtErr error,
|
||||
) {
|
||||
ctx, span := s.tracer.Start(ctx, "CloudMigrationService.report")
|
||||
defer span.End()
|
||||
|
||||
id, err := s.getLocalEventId(ctx)
|
||||
if err != nil {
|
||||
s.log.Error("failed to report event", "type", t, "error", err.Error())
|
||||
@@ -738,6 +788,9 @@ func (s *Service) getLocalEventId(ctx context.Context) (string, error) {
|
||||
}
|
||||
|
||||
func (s *Service) deleteLocalFiles(snapshots []cloudmigration.CloudMigrationSnapshot) error {
|
||||
_, span := s.tracer.Start(context.Background(), "CloudMigrationService.deleteLocalFiles")
|
||||
defer span.End()
|
||||
|
||||
var err error
|
||||
for _, snapshot := range snapshots {
|
||||
err = os.RemoveAll(snapshot.LocalDir)
|
||||
|
||||
Reference in New Issue
Block a user