grafana/pkg/storage/unified/client.go

package unified

import (
	"context"
	"fmt"
	"path/filepath"
	"time"

	otgrpc "github.com/opentracing-contrib/go-grpc"
	"github.com/opentracing/opentracing-go"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"gocloud.dev/blob/fileblob"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	"github.com/grafana/authlib/types"
	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/middleware"
	"github.com/grafana/dskit/services"
	infraDB "github.com/grafana/grafana/pkg/infra/db"
	"github.com/grafana/grafana/pkg/infra/tracing"
	secrets "github.com/grafana/grafana/pkg/registry/apis/secret/contracts"
	"github.com/grafana/grafana/pkg/services/apiserver/options"
	"github.com/grafana/grafana/pkg/services/featuremgmt"
	"github.com/grafana/grafana/pkg/setting"
	"github.com/grafana/grafana/pkg/storage/legacysql"
	"github.com/grafana/grafana/pkg/storage/unified/federated"
	"github.com/grafana/grafana/pkg/storage/unified/resource"
	"github.com/grafana/grafana/pkg/storage/unified/search"
	"github.com/grafana/grafana/pkg/storage/unified/sql"
	"github.com/grafana/grafana/pkg/util/scheduler"
)

// Options bundles the dependencies that ProvideUnifiedStorageClient forwards
// to newClient when building the unified storage client.
//
// Besides being forwarded, Cfg is read for the "grafana-apiserver" settings
// section and DB backs the legacy SQL provider handed to the federated client.
type Options struct {
	Cfg          *setting.Cfg
	Features     featuremgmt.FeatureToggles
	DB           infraDB.DB
	Tracer       tracing.Tracer
	Reg          prometheus.Registerer
	Authzc       types.AccessClient
	Docs         resource.DocumentBuilderSupplier
	SecureValues secrets.InlineSecureValueSupport
}

// clientMetrics groups the Prometheus collectors that grpcConn attaches to the
// gRPC client interceptors. Instances are created by newClientMetrics.
type clientMetrics struct {
	requestDuration *prometheus.HistogramVec // passed to instrument for the unary/stream instrumentation interceptors
	requestRetries  *prometheus.CounterVec   // passed to unaryRetryInstrument
}

// ProvideUnifiedStorageClient adds a UnifiedStorage client into the wire
// dependency tree.
//
// Storage settings are read from the "grafana-apiserver" section (with env
// overrides) and handed to newClient. On success the resulting client is
// wrapped in a federated client backed by the legacy SQL database; on failure
// the error from newClient is returned unchanged.
func ProvideUnifiedStorageClient(opts *Options,
	storageMetrics *resource.StorageMetrics,
	indexMetrics *resource.BleveIndexMetrics,
) (resource.ResourceClient, error) {
	// See: apiserver.applyAPIServerConfig(cfg, features, o)
	section := opts.Cfg.SectionWithEnvOverrides("grafana-apiserver")
	defaultStorageType := string(options.StorageTypeUnified)
	defaultDataPath := filepath.Join(opts.Cfg.DataPath, "grafana-apiserver")

	storageOpts := options.StorageOptions{
		StorageType:         options.StorageType(section.Key("storage_type").MustString(defaultStorageType)),
		DataPath:            section.Key("storage_path").MustString(defaultDataPath),
		Address:             section.Key("address").MustString(""),
		SearchServerAddress: section.Key("search_server_address").MustString(""),
		BlobStoreURL:        section.Key("blob_url").MustString(""),
		BlobThresholdBytes:  section.Key("blob_threshold_bytes").MustInt(options.BlobThresholdDefault),
	}

	base, err := newClient(storageOpts, opts.Cfg, opts.Features, opts.DB, opts.Tracer, opts.Reg, opts.Authzc, opts.Docs, storageMetrics, indexMetrics, opts.SecureValues)
	if err != nil {
		return nil, err
	}

	// Used to get the folder stats
	legacy := legacysql.NewDatabaseProvider(opts.DB)
	return federated.NewFederatedClient(base, legacy), nil
}

// newClient builds the resource client for opts.StorageType:
//
//   - options.StorageTypeFile: a local client on top of a file bucket rooted
//     at <DataPath>/resource (DataPath falls back to
//     <cfg.DataPath>/grafana-apiserver when empty).
//   - options.StorageTypeUnifiedGrpc: a client over a gRPC connection to
//     opts.Address. A second connection to opts.SearchServerAddress is used
//     as the index connection when that address is set; otherwise the
//     storage connection is reused. An empty opts.Address is an error.
//   - any other value: a local client on top of the resource server from the
//     sql package, with a QOS queue and scheduler started first when
//     cfg.QOSEnabled is set.
//
// Anything acquired along the way (bucket, gRPC connections, running QOS
// services) is released on a best-effort basis when a later step fails, so an
// error return does not leave those resources behind.
func newClient(opts options.StorageOptions,
	cfg *setting.Cfg,
	features featuremgmt.FeatureToggles,
	db infraDB.DB,
	tracer tracing.Tracer,
	reg prometheus.Registerer,
	authzc types.AccessClient,
	docs resource.DocumentBuilderSupplier,
	storageMetrics *resource.StorageMetrics,
	indexMetrics *resource.BleveIndexMetrics,
	secure secrets.InlineSecureValueSupport,
) (resource.ResourceClient, error) {
	ctx := context.Background()

	switch opts.StorageType {
	case options.StorageTypeFile:
		if opts.DataPath == "" {
			opts.DataPath = filepath.Join(cfg.DataPath, "grafana-apiserver")
		}
		bucket, err := fileblob.OpenBucket(filepath.Join(opts.DataPath, "resource"), &fileblob.Options{
			CreateDir: true,
			Metadata:  fileblob.MetadataDontWrite, // skip
		})
		if err != nil {
			return nil, err
		}
		backend, err := resource.NewCDKBackend(ctx, resource.CDKBackendOptions{
			Bucket: bucket,
		})
		if err != nil {
			// Nothing will use the bucket anymore. The Close error is ignored
			// on purpose: the construction error is the one worth reporting.
			_ = bucket.Close()
			return nil, err
		}
		server, err := resource.NewResourceServer(resource.ResourceServerOptions{
			Backend: backend,
			Blob: resource.BlobConfig{
				URL: opts.BlobStoreURL,
			},
		})
		if err != nil {
			// Same best-effort cleanup as above.
			_ = bucket.Close()
			return nil, err
		}
		return resource.NewLocalResourceClient(server), nil

	case options.StorageTypeUnifiedGrpc:
		if opts.Address == "" {
			return nil, fmt.Errorf("expecting address for storage_type: %s", opts.StorageType)
		}

		// closeConn releases a connection that will not be handed to a client.
		// grpc.ClientConnInterface has no Close method, so it is only called
		// when the concrete connection provides one. The Close error is
		// ignored on purpose: the caller is already returning another error.
		closeConn := func(c grpc.ClientConnInterface) {
			if closer, ok := c.(interface{ Close() error }); ok {
				_ = closer.Close()
			}
		}

		metrics := newClientMetrics(reg)

		conn, err := newGrpcConn(opts.Address, metrics, features)
		if err != nil {
			return nil, err
		}

		// Without a dedicated search server address the storage connection
		// doubles as the index connection.
		indexConn := conn
		dedicatedIndexConn := false
		if opts.SearchServerAddress != "" {
			indexConn, err = newGrpcConn(opts.SearchServerAddress, metrics, features)
			if err != nil {
				closeConn(conn)
				return nil, err
			}
			dedicatedIndexConn = true
		}

		// Create a client instance
		client, err := resource.NewResourceClient(conn, indexConn, cfg, features, tracer)
		if err != nil {
			// Only close the index connection separately when it is not the
			// storage connection, to avoid closing the same one twice.
			if dedicatedIndexConn {
				closeConn(indexConn)
			}
			closeConn(conn)
			return nil, err
		}
		return client, nil

	default:
		searchOptions, err := search.NewSearchOptions(features, cfg, tracer, docs, indexMetrics, nil)
		if err != nil {
			return nil, err
		}

		serverOptions := sql.ServerOptions{
			DB:             db,
			Cfg:            cfg,
			Tracer:         tracer,
			Reg:            reg,
			AccessClient:   authzc,
			SearchOptions:  searchOptions,
			StorageMetrics: storageMetrics,
			IndexMetrics:   indexMetrics,
			Features:       features,
			SecureValues:   secure,
		}

		// running tracks the QOS services that were started successfully so
		// they can be stopped again (latest first) if a later step fails.
		var running []services.Service
		stopRunning := func() {
			for i := len(running) - 1; i >= 0; i-- {
				// Best effort: the original failure is what gets reported.
				_ = services.StopAndAwaitTerminated(ctx, running[i])
			}
		}

		if cfg.QOSEnabled {
			qosReg := prometheus.WrapRegistererWithPrefix("resource_server_qos_", reg)
			queue := scheduler.NewQueue(&scheduler.QueueOptions{
				MaxSizePerTenant: cfg.QOSMaxSizePerTenant,
				Registerer:       qosReg,
				Logger:           cfg.Logger,
			})
			if err := services.StartAndAwaitRunning(ctx, queue); err != nil {
				return nil, fmt.Errorf("failed to start queue: %w", err)
			}
			running = append(running, queue)

			// Named sched so the scheduler package is not shadowed.
			sched, err := scheduler.NewScheduler(queue, &scheduler.Config{
				NumWorkers: cfg.QOSNumberWorker,
				Logger:     cfg.Logger,
			})
			if err != nil {
				stopRunning()
				return nil, fmt.Errorf("failed to create scheduler: %w", err)
			}

			if err := services.StartAndAwaitRunning(ctx, sched); err != nil {
				stopRunning()
				return nil, fmt.Errorf("failed to start scheduler: %w", err)
			}
			running = append(running, sched)

			serverOptions.QOSQueue = queue
		}

		server, err := sql.NewResourceServer(serverOptions)
		if err != nil {
			stopRunning()
			return nil, err
		}
		return resource.NewLocalResourceClient(server), nil
	}
}

// newGrpcConn returns the connection used to talk to the server at address.
//
// When the FlagUnifiedStorageGrpcConnectionPool toggle is enabled globally a
// connection pool is created, otherwise a single connection. The pool __can__
// be useful when connecting to server side load balancers like kube-proxy.
//
// Errors are returned together with an explicit nil interface instead of
// forwarding the constructor results: grpcConn returns a concrete
// *grpc.ClientConn, and forwarding a nil pointer would produce a non-nil
// grpc.ClientConnInterface.
func newGrpcConn(address string, metrics *clientMetrics, features featuremgmt.FeatureToggles) (grpc.ClientConnInterface, error) {
	if !features.IsEnabledGlobally(featuremgmt.FlagUnifiedStorageGrpcConnectionPool) {
		single, err := grpcConn(address, metrics)
		if err != nil {
			return nil, err
		}
		return single, nil
	}

	// Every pooled connection is created the same way as a single one.
	dial := func() (*grpc.ClientConn, error) {
		return grpcConn(address, metrics)
	}

	pool, err := newPooledConn(&poolOpts{
		initialCapacity: 3,
		maxCapacity:     6,
		idleTimeout:     time.Minute,
		factory:         dial,
	})
	if err != nil {
		return nil, err
	}
	return pool, nil
}

// grpcConn creates a new gRPC connection to the provided address.
//
// The client is built with the interceptors from instrument plus unary retry
// and retry-instrumentation interceptors, dskit's default grpcclient
// configuration, an OpenTelemetry stats handler, insecure transport
// credentials and a round_robin default service config.
func grpcConn(address string, metrics *clientMetrics) (*grpc.ClientConn, error) {
	// Report gRPC status code errors as labels.
	unaryChain, streamChain := instrument(metrics.requestDuration, middleware.ReportGRPCStatusOption)

	// Retry on transient connection issues. Streams get no retry middleware,
	// as we don't currently use streams.
	unaryChain = append(unaryChain,
		unaryRetryInterceptor(retryConfig{
			Max:           3,
			Backoff:       time.Second,
			BackoffJitter: 0.5,
		}),
		unaryRetryInstrument(metrics.requestRetries),
	)

	// Start from the defaults that are normally set by Config.RegisterFlags.
	var clientCfg grpcclient.Config
	flagext.DefaultValues(&clientCfg)

	dialOpts, err := clientCfg.DialOption(unaryChain, streamChain, nil)
	if err != nil {
		return nil, fmt.Errorf("could not instrument grpc client: %w", err)
	}

	dialOpts = append(dialOpts,
		grpc.WithStatsHandler(otelgrpc.NewClientHandler()),
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		// round_robin balances requests more evenly over the available Storage servers.
		grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy":"round_robin"}`),
		// Disable looking up service config from TXT DNS records.
		// This reduces the number of requests made to the DNS servers.
		grpc.WithDisableServiceConfig(),
	)

	// Create a connection to the gRPC server
	return grpc.NewClient(address, dialOpts...)
}

// GrpcConn is the exported counterpart of grpcConn that can be used for
// testing. Every call builds a new set of client metrics on reg before
// creating the connection.
func GrpcConn(address string, reg prometheus.Registerer) (*grpc.ClientConn, error) {
	return grpcConn(address, newClientMetrics(reg))
}

// instrument mirrors grpcclient.Instrument, minus the
// middleware.ClientUserHeaderInterceptor and
// middleware.StreamClientUserHeaderInterceptor, which we don't need.
//
// It returns the unary and stream interceptor chains; each chain holds an
// OpenTracing interceptor followed by an instrumentation interceptor that
// records into requestDuration.
func instrument(requestDuration *prometheus.HistogramVec, instrumentationLabelOptions ...middleware.InstrumentationOption) ([]grpc.UnaryClientInterceptor, []grpc.StreamClientInterceptor) {
	unaryChain := []grpc.UnaryClientInterceptor{
		otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
		middleware.UnaryClientInstrumentInterceptor(requestDuration, instrumentationLabelOptions...),
	}

	streamChain := []grpc.StreamClientInterceptor{
		otgrpc.OpenTracingStreamClientInterceptor(opentracing.GlobalTracer()),
		middleware.StreamClientInstrumentInterceptor(requestDuration, instrumentationLabelOptions...),
	}

	return unaryChain, streamChain
}

// newClientMetrics creates the request duration histogram and the retry
// counter for the resource server client, registering both on reg through
// promauto.
func newClientMetrics(reg prometheus.Registerer) *clientMetrics {
	// This works for now as the Provide function is only called once during startup.
	// We might eventually want to tie this factory to a struct for more runtime control.
	factory := promauto.With(reg)

	duration := factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "resource_server_client_request_duration_seconds",
			Help:    "Time spent executing requests to the resource server.",
			Buckets: prometheus.ExponentialBuckets(0.008, 4, 7),
		},
		[]string{"operation", "status_code"},
	)

	retries := factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "resource_server_client_request_retries_total",
			Help: "Total number of retries for requests to the resource server.",
		},
		[]string{"operation"},
	)

	return &clientMetrics{
		requestDuration: duration,
		requestRetries:  retries,
	}
}