SQL Expressions: Rework backend errors and error instrumentation (#109633)

* Capture error_type label on metrics/traces
* Make error messages more helpful to user
* Use errutil, categorized errors, and tie them to error_type (category in code)
* Misc trace fixes
* Add metric to track SQL input conversion
This commit is contained in:
Kyle Brandt
2025-08-25 11:13:42 -04:00
committed by GitHub
parent 539b413584
commit 4f0cb47d3c
18 changed files with 858 additions and 299 deletions
+166 -42
View File
@@ -8,9 +8,12 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/backend"
"github.com/grafana/grafana-plugin-sdk-go/data"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
"go.opentelemetry.io/otel/trace"
"github.com/grafana/grafana/pkg/apimachinery/errutil"
"github.com/grafana/grafana-plugin-sdk-go/backend/log"
"github.com/grafana/grafana/pkg/expr/mathexp"
"github.com/grafana/grafana/pkg/expr/metrics"
"github.com/grafana/grafana/pkg/expr/sql"
@@ -18,15 +21,7 @@ import (
"github.com/grafana/grafana/pkg/setting"
)
var (
ErrMissingSQLQuery = errutil.BadRequest("sql-missing-query").Errorf("missing SQL query")
ErrInvalidSQLQuery = errutil.BadRequest("sql-invalid-sql").MustTemplate(
"invalid SQL query: {{ .Private.query }} err: {{ .Error }}",
errutil.WithPublic(
"Invalid SQL query: {{ .Public.error }}",
),
)
)
const SQLLoggerName = "expr.sql"
// SQLCommand is an expression to run SQL over results
type SQLCommand struct {
@@ -39,31 +34,25 @@ type SQLCommand struct {
inputLimit int64
outputLimit int64
timeout time.Duration
logger log.Logger
}
// NewSQLCommand creates a new SQLCommand.
func NewSQLCommand(ctx context.Context, refID, format, rawSQL string, intputLimit, outputLimit int64, timeout time.Duration) (*SQLCommand, error) {
func NewSQLCommand(ctx context.Context, logger log.Logger, refID, format, rawSQL string, intputLimit, outputLimit int64, timeout time.Duration) (*SQLCommand, error) {
sqlLogger := backend.NewLoggerWith("logger", SQLLoggerName).FromContext(ctx)
if rawSQL == "" {
return nil, ErrMissingSQLQuery
return nil, sql.MakeErrEmptyQuery(refID)
}
tables, err := sql.TablesList(ctx, rawSQL)
if err != nil {
logger.Warn("invalid sql query", "sql", rawSQL, "error", err)
return nil, ErrInvalidSQLQuery.Build(errutil.TemplateData{
Error: err,
Public: map[string]any{
"error": err.Error(),
},
Private: map[string]any{
"query": rawSQL,
},
})
sqlLogger.Warn("invalid sql query", "sql", rawSQL, "error", err)
return nil, sql.MakeErrInvalidQuery(refID, err)
}
if len(tables) == 0 {
logger.Warn("no tables found in SQL query", "sql", rawSQL)
sqlLogger.Warn("no tables found in SQL query", "sql", rawSQL)
}
if tables != nil {
logger.Debug("REF tables", "tables", tables, "sql", rawSQL)
sqlLogger.Debug("REF tables", "tables", tables, "sql", rawSQL)
}
return &SQLCommand{
@@ -74,14 +63,15 @@ func NewSQLCommand(ctx context.Context, refID, format, rawSQL string, intputLimi
outputLimit: outputLimit,
timeout: timeout,
format: format,
logger: sqlLogger,
}, nil
}
// UnmarshalSQLCommand creates a SQLCommand from Grafana's frontend query.
func UnmarshalSQLCommand(ctx context.Context, rn *rawNode, cfg *setting.Cfg) (*SQLCommand, error) {
sqlLogger := backend.NewLoggerWith("logger", "expr.sql").FromContext(ctx)
sqlLogger := backend.NewLoggerWith("logger", SQLLoggerName).FromContext(ctx)
if rn.TimeRange == nil {
logger.Error("time range must be specified for refID", "refID", rn.RefID)
sqlLogger.Error("time range must be specified for refID", "refID", rn.RefID)
return nil, fmt.Errorf("time range must be specified for refID %s", rn.RefID)
}
@@ -99,7 +89,7 @@ func UnmarshalSQLCommand(ctx context.Context, rn *rawNode, cfg *setting.Cfg) (*S
formatRaw := rn.Query["format"]
format, _ := formatRaw.(string)
return NewSQLCommand(ctx, rn.RefID, format, expression, cfg.SQLExpressionCellLimit, cfg.SQLExpressionOutputCellLimit, cfg.SQLExpressionTimeout)
return NewSQLCommand(ctx, sqlLogger, rn.RefID, format, expression, cfg.SQLExpressionCellLimit, cfg.SQLExpressionOutputCellLimit, cfg.SQLExpressionTimeout)
}
// NeedsVars returns the variable names (refIds) that are dependencies
@@ -113,23 +103,32 @@ func (gr *SQLCommand) NeedsVars() []string {
func (gr *SQLCommand) Execute(ctx context.Context, now time.Time, vars mathexp.Vars, tracer tracing.Tracer, metrics *metrics.ExprMetrics) (mathexp.Results, error) {
_, span := tracer.Start(ctx, "SSE.ExecuteSQL")
start := time.Now()
sqlLogger := backend.NewLoggerWith("logger", "expr.sql").FromContext(ctx)
tc := int64(0)
rsp := mathexp.Results{}
errorType := "none"
defer func() {
duration := float64(time.Since(start).Milliseconds())
statusLabel := "ok"
if rsp.Error != nil {
e := &sql.ErrorWithCategory{}
if errors.As(rsp.Error, &e) {
errorType = e.Category()
} else {
errorType = "unknown"
}
statusLabel = "error"
span.RecordError(rsp.Error)
span.SetStatus(codes.Error, rsp.Error.Error())
sqlLogger.Error("SQL command execution failed", "error", rsp.Error.Error())
span.AddEvent("exception", trace.WithAttributes(
semconv.ExceptionType(errorType),
semconv.ExceptionMessage(rsp.Error.Error()),
))
span.SetAttributes(attribute.String("error.category", errorType))
span.SetStatus(codes.Error, errorType)
gr.logger.Error("SQL command execution failed", "error", rsp.Error.Error(), "error_type", errorType)
}
span.End()
metrics.SqlCommandCount.WithLabelValues(statusLabel).Inc()
metrics.SqlCommandCount.WithLabelValues(statusLabel, errorType).Inc()
metrics.SqlCommandDuration.WithLabelValues(statusLabel).Observe(duration)
metrics.SqlCommandCellCount.WithLabelValues(statusLabel).Observe(float64(tc))
}()
@@ -138,7 +137,7 @@ func (gr *SQLCommand) Execute(ctx context.Context, now time.Time, vars mathexp.V
for _, ref := range gr.varsToQuery {
results, ok := vars[ref]
if !ok {
sqlLogger.Warn("no results found for", "ref", ref)
gr.logger.Warn("no results found for", "ref", ref)
continue
}
frames := results.Values.AsDataFrames(ref)
@@ -149,15 +148,11 @@ func (gr *SQLCommand) Execute(ctx context.Context, now time.Time, vars mathexp.V
// limit of 0 or less means no limit (following convention)
if gr.inputLimit > 0 && tc > gr.inputLimit {
rsp.Error = fmt.Errorf(
"SQL expression: total cell count across all input tables exceeds limit of %d. Total cells: %d",
gr.inputLimit,
tc,
)
rsp.Error = sql.MakeInputLimitExceededError(gr.refID, gr.inputLimit)
return rsp, nil
}
sqlLogger.Debug("Executing query", "query", gr.query, "frames", len(allFrames))
gr.logger.Debug("Executing query", "query", gr.query, "frames", len(allFrames))
db := sql.DB{}
frame, err := db.QueryFrames(ctx, tracer, gr.refID, gr.query, allFrames, sql.WithMaxOutputCells(gr.outputLimit), sql.WithTimeout(gr.timeout))
@@ -166,7 +161,7 @@ func (gr *SQLCommand) Execute(ctx context.Context, now time.Time, vars mathexp.V
return rsp, nil
}
sqlLogger.Debug("Done Executing query", "query", gr.query, "rows", frame.Rows())
gr.logger.Debug("Done Executing query", "query", gr.query, "rows", frame.Rows())
if frame.Rows() == 0 {
rsp.Values = mathexp.Values{
@@ -293,7 +288,7 @@ func extractNumberSetFromSQLForAlerting(frame *data.Frame) ([]mathexp.Number, er
}
if len(duplicates) > 0 {
return nil, makeDuplicateStringColumnError(duplicates)
return nil, sql.MakeDuplicateStringColumnError(duplicates)
}
// Build final result
@@ -307,3 +302,132 @@ func extractNumberSetFromSQLForAlerting(frame *data.Frame) ([]mathexp.Number, er
return numbers, nil
}
// handleSqlInput normalizes input DataFrames into a single dataframe with no labels so it can represent a table for use with SQL expressions.
//
// It handles three cases:
// 1. If the input declares a supported time series or numeric kind in the wide or multi format (via FrameMeta.Type), it converts to a full-long formatted table using ConvertToFullLong.
// 2. If the input is a single frame (no labels, no declared type), it passes through as-is.
// 3. If the input has multiple frames or label metadata but lacks a supported type, it returns an error.
//
// The returned bool indicates if the input was (attempted to be) converted or passed through as-is.
func handleSqlInput(ctx context.Context, tracer trace.Tracer, refID string, forRefIDs map[string]struct{}, dsType string, dataFrames data.Frames) (mathexp.Results, bool) {
_, span := tracer.Start(ctx, "SSE.HandleConvertSQLInput")
start := time.Now()
var result mathexp.Results
errorType := "none"
var metaType data.FrameType
defer func() {
duration := float64(time.Since(start).Milliseconds())
statusLabel := "ok"
if result.Error != nil {
statusLabel = "error"
}
dataType := categorizeFrameInputType(dataFrames)
span.SetAttributes(
attribute.String("status", statusLabel),
attribute.Float64("duration", duration),
attribute.String("data.type", dataType),
attribute.String("datasource.type", dsType),
)
if result.Error != nil {
e := &sql.ErrorWithCategory{}
if errors.As(result.Error, &e) {
errorType = e.Category()
} else {
errorType = "unknown"
}
span.AddEvent("exception", trace.WithAttributes(
semconv.ExceptionType(errorType),
semconv.ExceptionMessage(result.Error.Error()),
))
span.SetAttributes(attribute.String("error.category", errorType))
span.SetStatus(codes.Error, errorType)
}
span.End()
}()
if len(dataFrames) == 0 {
return mathexp.Results{Values: mathexp.Values{mathexp.NewNoData()}}, false
}
first := dataFrames[0]
// Single Frame no data case
// Note: In the case of a support Frame Type, we may want to return the matching schema
// with no rows (e.g. include the `__value__` column). But not sure about this at this time.
if len(dataFrames) == 1 && len(first.Fields) == 0 {
result.Values = mathexp.Values{
mathexp.TableData{Frame: first},
}
return result, false
}
if first.Meta != nil {
metaType = first.Meta.Type
}
if supportedToLongConversion(metaType) {
convertedFrames, err := ConvertToFullLong(dataFrames)
if err != nil {
result.Error = sql.MakeInputConvertError(err, refID, forRefIDs, dsType)
}
if len(convertedFrames) == 0 {
result.Error = fmt.Errorf("conversion succeeded but returned no frames")
return result, true
}
result.Values = mathexp.Values{
mathexp.TableData{Frame: convertedFrames[0]},
}
return result, true
}
// If we don't have a supported type for conversion, see if we can pass through as a table (no labels, and only a single frame)
var frameTypeIssue string
if metaType == "" {
frameTypeIssue = "is missing the data type (frame.meta.type)"
} else {
frameTypeIssue = fmt.Sprintf("has an unsupported data type [%s]", metaType)
}
// If meta.type is not supported, but there are labels or more than 1 frame error
if len(dataFrames) > 1 {
result.Error = sql.MakeInputConvertError(fmt.Errorf("can not convert because the response %s and has more than one dataframe that can not be automatically mapped to a single table", frameTypeIssue), refID, forRefIDs, dsType)
return result, false
}
for _, frame := range dataFrames {
for _, field := range frame.Fields {
if len(field.Labels) > 0 {
result.Error = sql.MakeInputConvertError(fmt.Errorf("can not convert because the response %s and has labels in the response that can not be mapped to a table", frameTypeIssue), refID, forRefIDs, dsType)
return result, false
}
}
}
// Can pass through as table without conversion
result.Values = mathexp.Values{
mathexp.TableData{Frame: first},
}
return result, false
}
func categorizeFrameInputType(dataFrames data.Frames) string {
switch {
case len(dataFrames) == 0:
return "missing"
case dataFrames[0].Meta == nil:
return "missing"
case dataFrames[0].Meta.Type == "":
return "missing"
case dataFrames[0].Meta.Type.IsKnownType():
return string(dataFrames[0].Meta.Type)
default:
return "unknown"
}
}