[v9.4.x] fix(alerting): fallback to dashboard to get the full targets (#77693)

* [v9.4.x] fix(alerting): fallback to dashboard to get the full targets

* fix unit test

* fix more tests

* fix go lint
This commit is contained in:
Jean-Philippe Quéméner
2023-11-06 14:06:59 +01:00
committed by GitHub
parent 087cf5a877
commit bf2e034e49
5 changed files with 404 additions and 24 deletions
@@ -7,9 +7,11 @@ import (
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/log"
legacymodels "github.com/grafana/grafana/pkg/services/alerting/models"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/services/datasources"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/tsdb/graphite"
)
const (
@@ -111,7 +113,17 @@ func (m *migration) makeAlertRule(cond condition, da dashAlert, folderUID string
annotations["message"] = da.Message
var err error
data, err := migrateAlertRuleQueries(cond.Data)
d, err := m.fetchDashboard(da.OrgId, da.DashboardUID)
// Getting the dashboard is important, as we might want to extract some data out of it. Especially for Graphite queries.
if err != nil {
return nil, fmt.Errorf("failed to migrate alert rule queries: related dashboard could not be loaded: %w", err)
}
dsTypeMap, err := m.fetchDsTypes(cond.Data)
if err != nil {
return nil, fmt.Errorf("failed to migrate alert rule queries: datasources could not be loaded: %w", err)
}
l := log.New("ngalert.migration.queries")
data, err := migrateAlertRuleQueries(l, da.Id, cond.Data, da.PanelId, d, dsTypeMap)
if err != nil {
return nil, fmt.Errorf("failed to migrate alert rule queries: %w", err)
}
@@ -173,8 +185,65 @@ func (m *migration) makeAlertRule(cond condition, da dashAlert, folderUID string
return ar, nil
}
// fetchDashboard loads the dashboard identified by orgID and dashboardUID
// through the migration session. It returns dashboards.ErrDashboardNotFound
// when the session reports that no matching dashboard exists.
func (m *migration) fetchDashboard(orgID int64, dashboardUID string) (*dashboards.Dashboard, error) {
	// This is a hack for unit tests in 9.4.
	if m.sess == nil {
		return &dashboards.Dashboard{}, nil
	}

	// The struct doubles as the lookup condition and the destination.
	found := dashboards.Dashboard{OrgID: orgID, UID: dashboardUID}
	exists, err := m.sess.Get(&found)
	switch {
	case err != nil:
		return nil, err
	case !exists:
		return nil, dashboards.ErrDashboardNotFound
	}

	// Re-apply the loaded identifiers through the setters.
	found.SetID(found.ID)
	found.SetUID(found.UID)
	return &found, nil
}
// dsType is a minimal projection of a data source: only its UID and its
// plugin type, mapped to the "uid" and "type" columns via the xorm tags.
type dsType struct {
	UID string `xorm:"uid"`
	Type string `xorm:"type"`
}
// TableName returns the name of the table dsType rows are read from.
// NOTE(review): presumably consulted by xorm to pick the table — confirm.
func (dsType) TableName() string {
	return "data_source"
}
// fetchDsTypes builds a map from data source UID to data source type for the
// data sources referenced by the given queries.
//
// Every UID referenced in data is present in the returned map; its value is
// the empty string unless a type was loaded for it. When the migration has no
// session an empty map is returned. If loading fails, the error is returned
// together with a nil map.
func (m *migration) fetchDsTypes(data []alertQuery) (map[string]string, error) {
	// This is a hack for unit tests in 9.4.
	if m.sess == nil {
		return map[string]string{}, nil
	}

	result := make(map[string]string, len(data))
	for _, q := range data {
		result[q.DatasourceUID] = ""
	}

	var dsTypes []*dsType
	// Range over the map KEYS: the values are all still empty at this point,
	// so ranging over them would seed every entry with an empty UID.
	for uid := range result {
		dsTypes = append(dsTypes, &dsType{UID: uid})
	}

	// NOTE(review): the seeded slice is handed to Find as the destination;
	// whether xorm uses the seeded entries as a filter is not visible here —
	// confirm against the xorm documentation.
	if err := m.sess.Find(&dsTypes); err != nil {
		return nil, err
	}

	for _, ds := range dsTypes {
		if ds.Type == "" {
			// Seeded placeholders carry no type; the key is already in the
			// map, and skipping them keeps a placeholder from overwriting a
			// loaded type regardless of slice order.
			continue
		}
		result[ds.UID] = ds.Type
	}
	return result, nil
}
// migrateAlertRuleQueries attempts to fix alert rule queries so they can work in unified alerting. Queries of some data sources are not compatible with unified alerting.
func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
func migrateAlertRuleQueries(l log.Logger, ruleID int64, data []alertQuery, panelID int64, dashboard *dashboards.Dashboard, dsTypes map[string]string) ([]alertQuery, error) {
result := make([]alertQuery, 0, len(data))
for _, d := range data {
// queries that are expression are not relevant, skip them.
@@ -182,6 +251,14 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
result = append(result, d)
continue
}
dsType, ok := dsTypes[d.DatasourceUID]
if !ok {
l.Error("datasource not found", "uid", d.DatasourceUID)
return nil, fmt.Errorf("datasource not found")
}
if dsType != datasources.DS_GRAPHITE {
continue
}
var fixedData map[string]json.RawMessage
err := json.Unmarshal(d.Model, &fixedData)
if err != nil {
@@ -189,7 +266,7 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
}
// remove hidden tag from the query (if exists)
delete(fixedData, "hide")
fixedData = fixGraphiteReferencedSubQueries(fixedData)
fixedData = fixGraphiteReferencedSubQueries(l, fixedData, ruleID, panelID, dashboard)
updatedModel, err := json.Marshal(fixedData)
if err != nil {
return nil, err
@@ -200,18 +277,6 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
return result, nil
}
// fixGraphiteReferencedSubQueries rewrites a Graphite query model so it no
// longer relies on referenced sub queries, which unified alerting does not
// support. When the model carries the expanded query under the targetFull
// field, that value replaces the target field and targetFull is removed;
// otherwise the model is returned untouched.
func fixGraphiteReferencedSubQueries(queryData map[string]json.RawMessage) map[string]json.RawMessage {
	expanded, present := queryData[graphite.TargetFullModelField]
	if !present {
		return queryData
	}
	delete(queryData, graphite.TargetFullModelField)
	queryData[graphite.TargetModelField] = expanded
	return queryData
}
type alertQuery struct {
// RefID is the unique identifier of the query, set by the frontend call.
RefID string `json:"refId"`
@@ -305,7 +370,10 @@ func normalizeRuleName(daName string, uid string) string {
trunc := DefaultFieldMaxLength - 1 - len(uid)
daName = daName[:trunc] + "_" + uid
}
// The name can be empty, as this validation did not always exist.
if daName == "" {
return uid
}
return daName
}
@@ -0,0 +1,204 @@
package ualert
import (
"encoding/json"
"errors"
"fmt"
"regexp"
"strings"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/tsdb/graphite"
)
var (
	// hasPlaceholdersRe matches a sub-query placeholder: a '#' followed by one
	// or more ASCII letters. Capture group 1 holds the letters, i.e. the refID
	// being referenced.
	hasPlaceholdersRe = regexp.MustCompile(`#([A-Za-z]+)`)
)
// Counters describing how Graphite query migrations went. They are updated by
// fixGraphiteReferencedSubQueries and reported by logGraphiteMigrationStats.
// NOTE(review): plain int64s updated without synchronization — assumes the
// migration runs on a single goroutine; confirm.
var (
	// failedGraphiteMigrations counts queries that could not be expanded.
	failedGraphiteMigrations int64
	// successfulGraphiteMigrationCopy counts queries fixed by copying targetFull into target.
	successfulGraphiteMigrationCopy int64
	// successfulGraphiteMigrationDashboard counts queries fixed by expanding the target from the dashboard.
	successfulGraphiteMigrationDashboard int64
)
// panel is the subset of a dashboard panel's JSON that is needed to expand
// Graphite sub-query references: its id, its query targets, and any panels
// nested inside it.
type panel struct {
	ID int64 `json:"id"`
	Targets []target `json:"targets"`
	// Panels holds nested panels, which are searched recursively.
	Panels []panel `json:"panels"`
}
// target is the subset of a panel query's JSON that is needed to expand
// sub-query references: the query's refId and its target expression, which
// may itself reference other queries through '#<refId>' placeholders.
type target struct {
	RefID string `json:"refId"`
	Target string `json:"target"`
}
// fixGraphiteReferencedSubQueries attempts to fix graphite referenced sub queries, given unified alerting does not support this.
//
// The expanded query is obtained in one of two ways:
//  1. targetFull of the Graphite query model contains the expanded version of
//     field 'target'; if it is present and free of placeholders it is copied.
//  2. Otherwise the query is expanded from the targets of the dashboard panel
//     identified by panelID.
//
// On success the 'target' field holds the expanded query and the targetFull
// field is removed. If the query needs no fix, or the expansion fails,
// queryData is returned unchanged; failures are logged together with ruleID
// and counted. The map is modified in place and also returned.
func fixGraphiteReferencedSubQueries(l log.Logger, queryData map[string]json.RawMessage, ruleID, panelID int64, dashboard *dashboards.Dashboard) map[string]json.RawMessage {
	if !isFixable(l, queryData) {
		return queryData
	}

	fullQuery, ok := queryData[graphite.TargetFullModelField]
	// We also need to check for placeholders here, as the data can be very old and suffer from a bug that only
	// the first placeholder of the same name got replaced.
	if ok && !hasPlaceholders(string(fullQuery)) {
		successfulGraphiteMigrationCopy++
		delete(queryData, graphite.TargetFullModelField)
		queryData[graphite.TargetModelField] = fullQuery
		return queryData
	}

	// Sometimes it can happen that the "targetFull" field is not there (or is
	// itself broken). In this case we can extract this information from the
	// panel of the dashboard.
	fullQueryRaw, err := unwrapFromDashboard(l, queryData, panelID, dashboard)
	if err != nil {
		failedGraphiteMigrations++
		l.Error("graphite query migration: failed to unwrap query from dashboard", "err", err, "rule_id", ruleID)
		return queryData
	}
	b, err := json.Marshal(fullQueryRaw)
	if err != nil {
		failedGraphiteMigrations++
		l.Error("graphite query migration: failed to marshal the unwrapped query", "query", fullQueryRaw, "err", err, "rule_id", ruleID)
		return queryData
	}

	successfulGraphiteMigrationDashboard++
	l.Debug("graphite query migration: successfully unwrapped query using the dashboard", "query", fullQueryRaw, "rule_id", ruleID)
	// Drop a stale targetFull that still contains placeholders, exactly as the
	// copy path above does, so both success paths yield the same model shape.
	// NOTE(review): presumably the Graphite backend prefers targetFull over
	// target when both are present, which would make the repaired target
	// ineffective if the stale value were kept — confirm.
	delete(queryData, graphite.TargetFullModelField)
	queryData[graphite.TargetModelField] = b
	return queryData
}
// isFixable reports whether a Graphite query model is a candidate for
// sub-query expansion. That is the case when the model has a targetFull
// field, or when its 'target' field is a JSON string that contains at least
// one placeholder. A missing or undecodable 'target' is logged and treated as
// not fixable.
func isFixable(l log.Logger, queryData map[string]json.RawMessage) bool {
	if _, hasFull := queryData[graphite.TargetFullModelField]; hasFull {
		return true
	}

	// Without targetFull, only a target with placeholders needs fixing.
	rawTarget, hasTarget := queryData[graphite.TargetModelField]
	if !hasTarget {
		l.Error("query data does not have field 'target'")
		return false
	}

	var expression string
	if err := json.Unmarshal(rawTarget, &expression); err != nil {
		l.Error("failed to unmarshal target", "err", err)
		return false
	}
	return hasPlaceholders(expression)
}
// unwrapFromDashboard expands the query described by queryData using the
// targets stored in the dashboard.
//
// It reads the query's "refId", decodes the dashboard's "panels" and then
// looks up the panel with the given panelID (including nested panels) to
// expand the target with that refId.
//
// An error is returned when "refId" is missing or not a JSON string, when the
// dashboard carries no data, when the panels cannot be decoded, or when the
// panel lookup / expansion itself fails.
func unwrapFromDashboard(l log.Logger, queryData map[string]json.RawMessage, panelID int64, dashboard *dashboards.Dashboard) (string, error) {
	refIDRaw, ok := queryData["refId"]
	if !ok {
		return "", fmt.Errorf("query data does not have field 'refId'")
	}
	var refID string
	err := json.Unmarshal(refIDRaw, &refID)
	if err != nil {
		return "", fmt.Errorf("failed to unmarshal refId: %w", err)
	}

	// fetchDashboard hands out an empty Dashboard when there is no session, so
	// Data may be unset. Report that as an ordinary error instead of
	// dereferencing it.
	// NOTE(review): calling Get on a nil Data would presumably panic — confirm.
	if dashboard == nil || dashboard.Data == nil {
		return "", fmt.Errorf("dashboard has no data to extract panels from")
	}

	panelsRaw := dashboard.Data.Get("panels")
	// Simplejson doesn't let you unmarshal in a type, so we need to work around this.
	b, err := panelsRaw.MarshalJSON()
	if err != nil {
		return "", fmt.Errorf("failed to marshal panels: %w", err)
	}
	var panels []panel
	err = json.Unmarshal(b, &panels)
	if err != nil {
		return "", fmt.Errorf("failed to unmarshal panels: %w", err)
	}
	return unwrapFromPanel(l, panels, panelID, refID, 0)
}
// errPanelNotFound signals that no panel with the requested ID exists in the
// searched panel tree.
var errPanelNotFound = errors.New("panel not found")

// unwrapFromPanel searches panels, depth first, for the panel whose ID equals
// panelID and expands the target with the given refID from that panel's
// targets. It returns errPanelNotFound when no such panel exists in this
// (sub)tree; any other error from a nested search aborts the search.
// The l and parent arguments are not used by the search itself.
func unwrapFromPanel(l log.Logger, panels []panel, panelID int64, refID string, parent int64) (string, error) {
	for i := range panels {
		current := &panels[i]
		if current.ID == panelID {
			return unwrapTarget(refID, current.Targets)
		}
		if len(current.Panels) == 0 {
			continue
		}

		expanded, err := unwrapFromPanel(l, current.Panels, panelID, refID, current.ID)
		switch {
		case err == nil:
			return expanded, nil
		case !errors.Is(err, errPanelNotFound):
			return "", err
		}
		// Not in this subtree; keep looking at the remaining siblings.
	}
	return "", errPanelNotFound
}
// unwrapTarget returns the target expression of the query identified by refID
// with every '#<refID>' placeholder (as matched by hasPlaceholdersRe)
// recursively replaced by the fully expanded target it refers to.
//
// An error is returned when refID, or any refID referenced through a
// placeholder, is not present in targets, or when the references form a
// cycle.
//
// Each placeholder occurrence is replaced at its own position in a single
// left-to-right pass. A textual replace-all of "#A" would also rewrite the
// beginning of a longer placeholder such as "#AB".
func unwrapTarget(refID string, targets []target) (string, error) {
	// Index the raw target strings by refID.
	byRefID := make(map[string]string, len(targets))
	for _, t := range targets {
		byRefID[t.RefID] = t.Target
	}
	if _, ok := byRefID[refID]; !ok {
		return "", fmt.Errorf("refID %s not found in targets", refID)
	}

	// inProgress marks refIDs whose expansion has started but not finished;
	// meeting one of them again means the references are circular.
	inProgress := make(map[string]bool)
	// expanded caches finished expansions so a sub query that is referenced
	// several times is only expanded once.
	expanded := make(map[string]string)

	var unwrap func(string) (string, error)
	unwrap = func(currentRefID string) (string, error) {
		if done, ok := expanded[currentRefID]; ok {
			return done, nil
		}
		if inProgress[currentRefID] {
			return "", fmt.Errorf("circular dependency on refID %s", currentRefID)
		}
		raw, ok := byRefID[currentRefID]
		if !ok {
			return "", fmt.Errorf("refID %s not found in targets", currentRefID)
		}
		inProgress[currentRefID] = true
		defer func() { inProgress[currentRefID] = false }()

		var out strings.Builder
		last := 0
		// loc[0]:loc[1] spans the whole placeholder, loc[2]:loc[3] the refID.
		for _, loc := range hasPlaceholdersRe.FindAllStringSubmatchIndex(raw, -1) {
			replacement, err := unwrap(raw[loc[2]:loc[3]])
			if err != nil {
				return "", err
			}
			out.WriteString(raw[last:loc[0]])
			out.WriteString(replacement)
			last = loc[1]
		}
		out.WriteString(raw[last:])

		result := out.String()
		expanded[currentRefID] = result
		return result, nil
	}
	return unwrap(refID)
}
// hasPlaceholders reports whether s contains at least one sub-query
// placeholder as matched by hasPlaceholdersRe.
func hasPlaceholders(s string) bool {
	return hasPlaceholdersRe.MatchString(s)
}
// logGraphiteMigrationStats writes a single info line summarising the
// Graphite query migration counters. Nothing is logged when all three
// counters are still zero.
func logGraphiteMigrationStats(l log.Logger) {
	anythingToReport := failedGraphiteMigrations != 0 ||
		successfulGraphiteMigrationCopy != 0 ||
		successfulGraphiteMigrationDashboard != 0
	if anythingToReport {
		l.Info("Graphite migration stats",
			"failed", failedGraphiteMigrations,
			"success_copy", successfulGraphiteMigrationCopy,
			"success_dashboard", successfulGraphiteMigrationDashboard)
	}
}
@@ -0,0 +1,97 @@
package ualert
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestUnwrapTarget verifies that unwrapTarget expands '#<refID>' placeholders
// recursively and that it reports both circular and dangling references.
func TestUnwrapTarget(t *testing.T) {
	testCases := []struct {
		name     string
		refID    string
		targets  []target
		expected string
		wantErr  bool
		// errContains is the text the returned error must contain when wantErr
		// is set, so each failure case is pinned to the reason it is meant to
		// exercise.
		errContains string
	}{
		{
			name:  "Valid reference substitution",
			refID: "C",
			targets: []target{
				{
					RefID:  "A",
					Target: "first.query.count",
				},
				{
					RefID:  "B",
					Target: "second.query.count",
				},
				{
					RefID:  "C",
					Target: "scale(asPercent(diffSeries(#B, #A), #B), 100)",
				},
			},
			expected: "scale(asPercent(diffSeries(second.query.count, first.query.count), second.query.count), 100)",
			wantErr:  false,
		},
		{
			name:  "No error on multiple matches",
			refID: "D",
			targets: []target{
				{
					RefID:  "D",
					Target: "alias(divideSeries(scale(#C, 100.0), #B), 'SLI')",
				},
				{
					RefID:  "C",
					Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count)",
				},
				{
					RefID:  "B",
					Target: "sumSeries(#A, #C)",
				},
				{
					RefID:  "A",
					Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count)",
				},
			},
			expected: "alias(divideSeries(scale(integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count), 100.0), sumSeries(integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count), integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count))), 'SLI')",
			wantErr:  false,
		},
		{
			// Every referenced refID exists here, so the only possible failure
			// is the D -> B -> D cycle.
			name:  "Error on circular references",
			refID: "D",
			targets: []target{
				{
					RefID:  "D",
					Target: "alias(divideSeries(scale(#C, 100.0), #B), 'SLI')",
				},
				{
					RefID:  "C",
					Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count)",
				},
				{
					RefID:  "B",
					Target: "sumSeries(#A, #D)",
				},
				{
					RefID:  "A",
					Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count)",
				},
			},
			wantErr:     true,
			errContains: "circular dependency",
		},
		{
			// #C is referenced but never defined.
			name:  "Error on reference to a missing target",
			refID: "D",
			targets: []target{
				{
					RefID:  "D",
					Target: "alias(divideSeries(scale(#C, 100.0), #B), 'SLI')",
				},
				{
					RefID:  "B",
					Target: "sumSeries(#A)",
				},
				{
					RefID:  "A",
					Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count)",
				},
			},
			wantErr:     true,
			errContains: "refID C not found in targets",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Act
			result, err := unwrapTarget(tc.refID, tc.targets)

			// Assert
			if tc.wantErr {
				require.Error(t, err, "unwrapTarget should return an error")
				require.Contains(t, err.Error(), tc.errContains, "unwrapTarget returned an unexpected error")
				return
			}
			require.NoError(t, err, "unwrapTarget should not return an error")
			require.Equal(t, tc.expected, result, "unwrapTarget returned unexpected result")
		})
	}
}
@@ -8,6 +8,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/dashboards"
)
func TestMigrateAlertRuleQueries(t *testing.T) {
@@ -18,18 +20,25 @@ func TestMigrateAlertRuleQueries(t *testing.T) {
err error
}{
{
name: "when a query has a sub query - it is extracted",
input: simplejson.NewFromAny(map[string]interface{}{"targetFull": "thisisafullquery", "target": "ahalfquery"}),
name: "when a query has a sub query - it is extracted",
input: simplejson.NewFromAny(map[string]interface{}{
"targetFull": "thisisafullquery",
"target": "ahalfquery",
}),
expected: `{"target":"thisisafullquery"}`,
},
{
name: "when a query does not have a sub query - it no-ops",
input: simplejson.NewFromAny(map[string]interface{}{"target": "ahalfquery"}),
name: "when a query does not have a sub query - it no-ops",
input: simplejson.NewFromAny(map[string]interface{}{
"target": "ahalfquery",
}),
expected: `{"target":"ahalfquery"}`,
},
{
name: "when query was hidden, it removes the flag",
input: simplejson.NewFromAny(map[string]interface{}{"hide": true}),
name: "when query was hidden, it removes the flag",
input: simplejson.NewFromAny(map[string]interface{}{
"hide": true,
}),
expected: `{}`,
},
}
@@ -38,7 +47,7 @@ func TestMigrateAlertRuleQueries(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
model, err := tt.input.Encode()
require.NoError(t, err)
queries, err := migrateAlertRuleQueries([]alertQuery{{Model: model}})
queries, err := migrateAlertRuleQueries(log.NewNopLogger(), 0, []alertQuery{{Model: model, DatasourceUID: "a"}}, 0, &dashboards.Dashboard{}, map[string]string{"a": "graphite"})
if tt.err != nil {
require.Error(t, err)
require.EqualError(t, err, tt.err.Error())
@@ -391,6 +391,8 @@ func (m *migration) Exec(sess *xorm.Session, mg *migrator.Migrator) error {
}
}
logGraphiteMigrationStats(m.mg.Logger)
for orgID := range rulesPerOrg {
if err := m.addPauseSilence(orgID); err != nil {
m.mg.Logger.Error("alert migration error: failed to create silence for paused alerts")