Files
grafana/pkg/storage/unified/sql/service.go
owensmallwood a3daf0e39d Unified storage: Add quotas app to apiserver (#114425)
* initial generation

* went through doc to add new resource

* added dummy kind so grafana will run

* added dummy handler and custom route

* fix app name

* gets custom route working - still a dummy route

* adds groupOverride to manifest

* adds quotas to grpc client and server

* WIP - trying to get api recognized - not working

* Gets route working

* fixes group and resource vars

* expects group and resource as separate params

* set content-type header on response

* removes Quotas kind and regens

* Update grafana-app-sdk to v0.48.5

* Update codegen

* updates manifest

* formatting

* updates grafana-app-sdk version to 0.48.5

* regen ResourceClient mocks

* adds tests

* remove commented code

* uncomment go mod tidy

* fix tests and make update workspace

* adds quotas app to codeowners

* formatting

* make gen-apps

* deletes temp file

* fix generated folder code

* make gofmt

* make gen-go

* make update-workspace

* add COPY apps/quotas to Dockerfile

* fix test mock

* fixes undefined NewFolderStatus()

* make gen-apps, and add func for NewFolderStatus

* make gen-apps again

* make update-workspace

* regen folder_object_gen.go

* gofmt

* fix linting

* apps/folder make update-workspace

* make gen-apps

* make gen-apps

* fixes enterprise_imports.go

* go get testcontainers

* adds feature toggle

* make update-workspace

* fix go mod

* fix another client mock

---------

Co-authored-by: Steve Simpson <steve@grafana.com>
2025-12-09 09:40:34 -06:00

484 lines
14 KiB
Go

package sql
import (
"context"
"errors"
"fmt"
"hash/fnv"
"net"
"net/http"
"os"
"strconv"
"time"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"google.golang.org/grpc/health/grpc_health_v1"
"github.com/grafana/authlib/grpcutils"
"github.com/grafana/dskit/kv"
"github.com/grafana/dskit/netutil"
"github.com/grafana/dskit/ring"
"github.com/grafana/dskit/services"
infraDB "github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/modules"
"github.com/grafana/grafana/pkg/services/authz"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/grpcserver"
"github.com/grafana/grafana/pkg/services/grpcserver/interceptors"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/storage/unified/resource"
"github.com/grafana/grafana/pkg/storage/unified/resource/grpc"
"github.com/grafana/grafana/pkg/storage/unified/resourcepb"
"github.com/grafana/grafana/pkg/storage/unified/search"
"github.com/grafana/grafana/pkg/util/scheduler"
)
var (
_ UnifiedStorageGrpcService = (*service)(nil)
)
type UnifiedStorageGrpcService interface {
services.NamedService
// Return the address where this service is running
GetAddress() string
}
type service struct {
*services.BasicService
// Subservices manager
subservices *services.Manager
subservicesWatcher *services.FailureWatcher
hasSubservices bool
backend resource.StorageBackend
cfg *setting.Cfg
features featuremgmt.FeatureToggles
db infraDB.DB
stopCh chan struct{}
stoppedCh chan error
handler grpcserver.Provider
tracing trace.Tracer
authenticator func(ctx context.Context) (context.Context, error)
log log.Logger
reg prometheus.Registerer
storageMetrics *resource.StorageMetrics
indexMetrics *resource.BleveIndexMetrics
docBuilders resource.DocumentBuilderSupplier
searchRing *ring.Ring
ringLifecycler *ring.BasicLifecycler
queue QOSEnqueueDequeuer
scheduler *scheduler.Scheduler
}
func ProvideUnifiedStorageGrpcService(
cfg *setting.Cfg,
features featuremgmt.FeatureToggles,
db infraDB.DB,
log log.Logger,
reg prometheus.Registerer,
docBuilders resource.DocumentBuilderSupplier,
storageMetrics *resource.StorageMetrics,
indexMetrics *resource.BleveIndexMetrics,
searchRing *ring.Ring,
memberlistKVConfig kv.Config,
httpServerRouter *mux.Router,
backend resource.StorageBackend,
) (UnifiedStorageGrpcService, error) {
var err error
tracer := otel.Tracer("unified-storage")
// FIXME: This is a temporary solution while we are migrating to the new authn interceptor
// grpcutils.NewGrpcAuthenticator should be used instead.
authn := NewAuthenticatorWithFallback(cfg, reg, tracer, func(ctx context.Context) (context.Context, error) {
auth := grpc.Authenticator{Tracer: tracer}
return auth.Authenticate(ctx)
})
s := &service{
backend: backend,
cfg: cfg,
features: features,
stopCh: make(chan struct{}),
authenticator: authn,
tracing: tracer,
db: db,
log: log,
reg: reg,
docBuilders: docBuilders,
storageMetrics: storageMetrics,
indexMetrics: indexMetrics,
searchRing: searchRing,
subservicesWatcher: services.NewFailureWatcher(),
}
subservices := []services.Service{}
if cfg.EnableSharding {
ringStore, err := kv.NewClient(
memberlistKVConfig,
ring.GetCodec(),
kv.RegistererWithKVName(reg, resource.RingName),
log,
)
if err != nil {
return nil, fmt.Errorf("failed to create KV store client: %s", err)
}
lifecyclerCfg, err := toLifecyclerConfig(cfg, log)
if err != nil {
return nil, fmt.Errorf("failed to initialize storage-ring lifecycler config: %s", err)
}
// Define lifecycler delegates in reverse order (last to be called defined first because they're
// chained via "next delegate").
delegate := ring.BasicLifecyclerDelegate(ring.NewInstanceRegisterDelegate(ring.JOINING, resource.RingNumTokens))
delegate = ring.NewLeaveOnStoppingDelegate(delegate, log)
delegate = ring.NewAutoForgetDelegate(resource.RingHeartbeatTimeout*2, delegate, log)
s.ringLifecycler, err = ring.NewBasicLifecycler(
lifecyclerCfg,
resource.RingName,
resource.RingKey,
ringStore,
delegate,
log,
reg,
)
if err != nil {
return nil, fmt.Errorf("failed to initialize storage-ring lifecycler: %s", err)
}
s.ringLifecycler.SetKeepInstanceInTheRingOnShutdown(true)
subservices = append(subservices, s.ringLifecycler)
if httpServerRouter != nil {
httpServerRouter.Path("/prepare-downscale").Methods("GET", "POST", "DELETE").Handler(http.HandlerFunc(s.PrepareDownscale))
}
}
if cfg.QOSEnabled {
qosReg := prometheus.WrapRegistererWithPrefix("resource_server_qos_", reg)
queue := scheduler.NewQueue(&scheduler.QueueOptions{
MaxSizePerTenant: cfg.QOSMaxSizePerTenant,
Registerer: qosReg,
})
scheduler, err := scheduler.NewScheduler(queue, &scheduler.Config{
NumWorkers: cfg.QOSNumberWorker,
Logger: log,
})
if err != nil {
return nil, fmt.Errorf("failed to create qos scheduler: %s", err)
}
s.queue = queue
s.scheduler = scheduler
subservices = append(subservices, s.queue, s.scheduler)
}
if len(subservices) > 0 {
s.hasSubservices = true
s.subservices, err = services.NewManager(subservices...)
if err != nil {
return nil, fmt.Errorf("failed to create subservices manager: %w", err)
}
}
// This will be used when running as a dskit service
s.BasicService = services.NewBasicService(s.starting, s.running, s.stopping).WithName(modules.StorageServer)
return s, nil
}
func (s *service) PrepareDownscale(w http.ResponseWriter, r *http.Request) {
switch r.Method {
case http.MethodPost:
s.log.Info("Preparing for downscale. Will not keep instance in ring on shutdown.")
s.ringLifecycler.SetKeepInstanceInTheRingOnShutdown(false)
case http.MethodDelete:
s.log.Info("Downscale canceled. Will keep instance in ring on shutdown.")
s.ringLifecycler.SetKeepInstanceInTheRingOnShutdown(true)
case http.MethodGet:
// used for delayed downscale use case, which we don't support. Leaving here for completion sake
s.log.Info("Received GET request for prepare-downscale. Behavior not implemented.")
default:
}
}
var (
// operation used by the search-servers to check if they own the namespace
searchOwnerRead = ring.NewOp([]ring.InstanceState{ring.JOINING, ring.ACTIVE, ring.LEAVING}, nil)
)
func (s *service) OwnsIndex(key resource.NamespacedResource) (bool, error) {
if s.searchRing == nil {
return true, nil
}
if st := s.searchRing.State(); st != services.Running {
return false, fmt.Errorf("ring is not Running: %s", st)
}
ringHasher := fnv.New32a()
_, err := ringHasher.Write([]byte(key.Namespace))
if err != nil {
return false, fmt.Errorf("error hashing namespace: %w", err)
}
rs, err := s.searchRing.GetWithOptions(ringHasher.Sum32(), searchOwnerRead, ring.WithReplicationFactor(s.searchRing.ReplicationFactor()))
if err != nil {
return false, fmt.Errorf("error getting replicaset from ring: %w", err)
}
return rs.Includes(s.ringLifecycler.GetInstanceAddr()), nil
}
func (s *service) starting(ctx context.Context) error {
if s.hasSubservices {
s.subservicesWatcher.WatchManager(s.subservices)
if err := services.StartManagerAndAwaitHealthy(ctx, s.subservices); err != nil {
return fmt.Errorf("failed to start subservices: %w", err)
}
}
authzClient, err := authz.ProvideStandaloneAuthZClient(s.cfg, s.features, s.tracing, s.reg)
if err != nil {
return err
}
searchOptions, err := search.NewSearchOptions(s.features, s.cfg, s.docBuilders, s.indexMetrics, s.OwnsIndex)
if err != nil {
return err
}
serverOptions := ServerOptions{
Backend: s.backend,
DB: s.db,
Cfg: s.cfg,
Tracer: s.tracing,
Reg: s.reg,
AccessClient: authzClient,
SearchOptions: searchOptions,
StorageMetrics: s.storageMetrics,
IndexMetrics: s.indexMetrics,
Features: s.features,
QOSQueue: s.queue,
OwnsIndexFn: s.OwnsIndex,
}
if s.cfg.OverridesFilePath != "" {
overridesSvc, err := resource.NewOverridesService(context.Background(), s.log, s.reg, s.tracing, resource.ReloadOptions{
FilePath: s.cfg.OverridesFilePath,
ReloadPeriod: s.cfg.OverridesReloadInterval,
})
if err != nil {
return err
}
serverOptions.OverridesService = overridesSvc
}
server, err := NewResourceServer(serverOptions)
if err != nil {
return err
}
s.handler, err = grpcserver.ProvideService(s.cfg, s.features, interceptors.AuthenticatorFunc(s.authenticator), s.tracing, prometheus.DefaultRegisterer)
if err != nil {
return err
}
healthService, err := resource.ProvideHealthService(server)
if err != nil {
return err
}
srv := s.handler.GetServer()
resourcepb.RegisterResourceStoreServer(srv, server)
resourcepb.RegisterBulkStoreServer(srv, server)
resourcepb.RegisterResourceIndexServer(srv, server)
resourcepb.RegisterManagedObjectIndexServer(srv, server)
resourcepb.RegisterBlobStoreServer(srv, server)
resourcepb.RegisterDiagnosticsServer(srv, server)
resourcepb.RegisterQuotasServer(srv, server)
grpc_health_v1.RegisterHealthServer(srv, healthService)
// register reflection service
_, err = grpcserver.ProvideReflectionService(s.cfg, s.handler)
if err != nil {
return err
}
if s.cfg.EnableSharding {
s.log.Info("waiting until resource server is JOINING in the ring")
lfcCtx, cancel := context.WithTimeout(context.Background(), s.cfg.ResourceServerJoinRingTimeout)
defer cancel()
if err := ring.WaitInstanceState(lfcCtx, s.searchRing, s.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
return fmt.Errorf("error switching to JOINING in the ring: %s", err)
}
s.log.Info("resource server is JOINING in the ring")
if err := s.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
return fmt.Errorf("error switching to ACTIVE in the ring: %s", err)
}
s.log.Info("resource server is ACTIVE in the ring")
}
// start the gRPC server
go func() {
err := s.handler.Run(ctx)
if err != nil {
s.stoppedCh <- err
} else {
s.stoppedCh <- nil
}
}()
return nil
}
// GetAddress returns the address of the gRPC server.
func (s *service) GetAddress() string {
return s.handler.GetAddress()
}
func (s *service) running(ctx context.Context) error {
select {
case err := <-s.stoppedCh:
if err != nil && !errors.Is(err, context.Canceled) {
return err
}
case err := <-s.subservicesWatcher.Chan():
return fmt.Errorf("subservice failure: %w", err)
case <-ctx.Done():
close(s.stopCh)
}
return nil
}
func (s *service) stopping(_ error) error {
if s.hasSubservices {
err := services.StopManagerAndAwaitStopped(context.Background(), s.subservices)
if err != nil {
return fmt.Errorf("failed to stop subservices: %w", err)
}
}
return nil
}
type authenticatorWithFallback struct {
authenticator func(ctx context.Context) (context.Context, error)
fallback func(ctx context.Context) (context.Context, error)
metrics *metrics
tracer trace.Tracer
}
type metrics struct {
requestsTotal *prometheus.CounterVec
}
func (f *authenticatorWithFallback) Authenticate(ctx context.Context) (context.Context, error) {
ctx, span := f.tracer.Start(ctx, "grpcutils.AuthenticatorWithFallback.Authenticate")
defer span.End()
// Try to authenticate with the new authenticator first
span.SetAttributes(attribute.Bool("fallback_used", false))
newCtx, err := f.authenticator(ctx)
if err == nil {
// fallback not used, authentication successful
f.metrics.requestsTotal.WithLabelValues("false", "true").Inc()
return newCtx, nil
}
// In case of error, fallback to the legacy authenticator
span.SetAttributes(attribute.Bool("fallback_used", true))
newCtx, err = f.fallback(ctx)
if newCtx != nil {
newCtx = resource.WithFallback(newCtx)
}
f.metrics.requestsTotal.WithLabelValues("true", fmt.Sprintf("%t", err == nil)).Inc()
return newCtx, err
}
func newMetrics(reg prometheus.Registerer) *metrics {
return &metrics{
requestsTotal: promauto.With(reg).NewCounterVec(
prometheus.CounterOpts{
Name: "grafana_grpc_authenticator_with_fallback_requests_total",
Help: "Number requests using the authenticator with fallback",
}, []string{"fallback_used", "result"}),
}
}
func ReadGrpcServerConfig(cfg *setting.Cfg) *grpcutils.AuthenticatorConfig {
section := cfg.SectionWithEnvOverrides("grpc_server_authentication")
return &grpcutils.AuthenticatorConfig{
SigningKeysURL: section.Key("signing_keys_url").MustString(""),
AllowedAudiences: section.Key("allowed_audiences").Strings(","),
AllowInsecure: cfg.Env == setting.Dev,
}
}
func NewAuthenticatorWithFallback(cfg *setting.Cfg, reg prometheus.Registerer, tracer trace.Tracer, fallback func(context.Context) (context.Context, error)) func(context.Context) (context.Context, error) {
authCfg := ReadGrpcServerConfig(cfg)
authenticator := grpcutils.NewAuthenticator(authCfg, tracer)
metrics := newMetrics(reg)
return func(ctx context.Context) (context.Context, error) {
a := &authenticatorWithFallback{
authenticator: authenticator,
fallback: fallback,
tracer: tracer,
metrics: metrics,
}
return a.Authenticate(ctx)
}
}
func toLifecyclerConfig(cfg *setting.Cfg, logger log.Logger) (ring.BasicLifecyclerConfig, error) {
instanceAddr, err := ring.GetInstanceAddr(cfg.MemberlistBindAddr, netutil.PrivateNetworkInterfacesWithFallback([]string{"eth0", "en0"}, logger), logger, true)
if err != nil {
return ring.BasicLifecyclerConfig{}, err
}
instanceId := cfg.InstanceID
if instanceId == "" {
hostname, err := os.Hostname()
if err != nil {
return ring.BasicLifecyclerConfig{}, err
}
instanceId = hostname
}
_, grpcPortStr, err := net.SplitHostPort(cfg.GRPCServer.Address)
if err != nil {
return ring.BasicLifecyclerConfig{}, fmt.Errorf("could not get grpc port from grpc server address: %s", err)
}
grpcPort, err := strconv.Atoi(grpcPortStr)
if err != nil {
return ring.BasicLifecyclerConfig{}, fmt.Errorf("error converting grpc address port to int: %s", err)
}
return ring.BasicLifecyclerConfig{
Addr: fmt.Sprintf("%s:%d", instanceAddr, grpcPort),
ID: instanceId,
HeartbeatPeriod: 15 * time.Second,
HeartbeatTimeout: resource.RingHeartbeatTimeout,
TokensObservePeriod: 0,
NumTokens: resource.RingNumTokens,
}, nil
}