[v9.4.x] fix(alerting): fallback to dashboard to get the full targets (#77693)
* [v9.4.x] fix(alerting): fallback to dashboard to get the full targets * fix unit test * fix more tests * fix go lint
This commit is contained in:
committed by
GitHub
parent
087cf5a877
commit
bf2e034e49
@@ -7,9 +7,11 @@ import (
|
||||
|
||||
"github.com/grafana/grafana/pkg/components/simplejson"
|
||||
"github.com/grafana/grafana/pkg/expr"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
legacymodels "github.com/grafana/grafana/pkg/services/alerting/models"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
"github.com/grafana/grafana/pkg/services/datasources"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/tsdb/graphite"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -111,7 +113,17 @@ func (m *migration) makeAlertRule(cond condition, da dashAlert, folderUID string
|
||||
annotations["message"] = da.Message
|
||||
var err error
|
||||
|
||||
data, err := migrateAlertRuleQueries(cond.Data)
|
||||
d, err := m.fetchDashboard(da.OrgId, da.DashboardUID)
|
||||
// Getting the dashboard is important, as we might want to extract some data out of it, especially for Graphite queries.
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to migrate alert rule queries: related dashboard could not be loaded: %w", err)
|
||||
}
|
||||
dsTypeMap, err := m.fetchDsTypes(cond.Data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to migrate alert rule queries: datasources could not be loaded: %w", err)
|
||||
}
|
||||
l := log.New("ngalert.migration.queries")
|
||||
data, err := migrateAlertRuleQueries(l, da.Id, cond.Data, da.PanelId, d, dsTypeMap)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to migrate alert rule queries: %w", err)
|
||||
}
|
||||
@@ -173,8 +185,65 @@ func (m *migration) makeAlertRule(cond condition, da dashAlert, folderUID string
|
||||
return ar, nil
|
||||
}
|
||||
|
||||
func (m *migration) fetchDashboard(orgID int64, dashboardUID string) (*dashboards.Dashboard, error) {
|
||||
// This is a hack for unit tests in 9.4.
|
||||
if m.sess == nil {
|
||||
return &dashboards.Dashboard{}, nil
|
||||
}
|
||||
|
||||
var queryResult *dashboards.Dashboard
|
||||
|
||||
dashboard := dashboards.Dashboard{OrgID: orgID, UID: dashboardUID}
|
||||
|
||||
has, err := m.sess.Get(&dashboard)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if !has {
|
||||
return nil, dashboards.ErrDashboardNotFound
|
||||
}
|
||||
|
||||
dashboard.SetID(dashboard.ID)
|
||||
dashboard.SetUID(dashboard.UID)
|
||||
queryResult = &dashboard
|
||||
|
||||
return queryResult, err
|
||||
}
|
||||
|
||||
// dsType is a minimal projection of a data source row: only its UID and type.
type dsType struct {
	UID  string `xorm:"uid"`
	Type string `xorm:"type"`
}

// TableName reports the table that dsType rows are read from.
func (d dsType) TableName() string {
	const table = "data_source"
	return table
}
|
||||
|
||||
func (m *migration) fetchDsTypes(data []alertQuery) (map[string]string, error) {
|
||||
// This is a hack for unit tests in 9.4.
|
||||
if m.sess == nil {
|
||||
return map[string]string{}, nil
|
||||
}
|
||||
|
||||
result := make(map[string]string)
|
||||
for _, q := range data {
|
||||
result[q.DatasourceUID] = ""
|
||||
}
|
||||
var dsTypes []*dsType
|
||||
for _, uid := range result {
|
||||
dsTypes = append(dsTypes, &dsType{UID: uid})
|
||||
}
|
||||
|
||||
err := m.sess.Find(&dsTypes)
|
||||
|
||||
for _, ds := range dsTypes {
|
||||
result[ds.UID] = ds.Type
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
|
||||
// migrateAlertRuleQueries attempts to fix alert rule queries so they can work in unified alerting. Queries of some data sources are not compatible with unified alerting.
|
||||
func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
|
||||
func migrateAlertRuleQueries(l log.Logger, ruleID int64, data []alertQuery, panelID int64, dashboard *dashboards.Dashboard, dsTypes map[string]string) ([]alertQuery, error) {
|
||||
result := make([]alertQuery, 0, len(data))
|
||||
for _, d := range data {
|
||||
// queries that are expression are not relevant, skip them.
|
||||
@@ -182,6 +251,14 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
|
||||
result = append(result, d)
|
||||
continue
|
||||
}
|
||||
dsType, ok := dsTypes[d.DatasourceUID]
|
||||
if !ok {
|
||||
l.Error("datasource not found", "uid", d.DatasourceUID)
|
||||
return nil, fmt.Errorf("datasource not found")
|
||||
}
|
||||
if dsType != datasources.DS_GRAPHITE {
|
||||
continue
|
||||
}
|
||||
var fixedData map[string]json.RawMessage
|
||||
err := json.Unmarshal(d.Model, &fixedData)
|
||||
if err != nil {
|
||||
@@ -189,7 +266,7 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
|
||||
}
|
||||
// remove hidden tag from the query (if exists)
|
||||
delete(fixedData, "hide")
|
||||
fixedData = fixGraphiteReferencedSubQueries(fixedData)
|
||||
fixedData = fixGraphiteReferencedSubQueries(l, fixedData, ruleID, panelID, dashboard)
|
||||
updatedModel, err := json.Marshal(fixedData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -200,18 +277,6 @@ func migrateAlertRuleQueries(data []alertQuery) ([]alertQuery, error) {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// fixGraphiteReferencedSubQueries attempts to fix graphite referenced sub queries, given unified alerting does not support this.
|
||||
// targetFull of Graphite data source contains the expanded version of field 'target', so let's copy that.
|
||||
func fixGraphiteReferencedSubQueries(queryData map[string]json.RawMessage) map[string]json.RawMessage {
|
||||
fullQuery, ok := queryData[graphite.TargetFullModelField]
|
||||
if ok {
|
||||
delete(queryData, graphite.TargetFullModelField)
|
||||
queryData[graphite.TargetModelField] = fullQuery
|
||||
}
|
||||
|
||||
return queryData
|
||||
}
|
||||
|
||||
type alertQuery struct {
|
||||
// RefID is the unique identifier of the query, set by the frontend call.
|
||||
RefID string `json:"refId"`
|
||||
@@ -305,7 +370,10 @@ func normalizeRuleName(daName string, uid string) string {
|
||||
trunc := DefaultFieldMaxLength - 1 - len(uid)
|
||||
daName = daName[:trunc] + "_" + uid
|
||||
}
|
||||
|
||||
// The name can be empty, as this validation did not always exist.
|
||||
if daName == "" {
|
||||
return uid
|
||||
}
|
||||
return daName
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,204 @@
|
||||
package ualert
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
"github.com/grafana/grafana/pkg/tsdb/graphite"
|
||||
)
|
||||
|
||||
var (
	// hasPlaceholdersRe matches a '#' followed by one or more ASCII letters and
	// captures the letters (the referenced refId).
	hasPlaceholdersRe = regexp.MustCompile(`#([A-Za-z]+)`)

	// Counters reported by logGraphiteMigrationStats.
	failedGraphiteMigrations             int64
	successfulGraphiteMigrationCopy      int64
	successfulGraphiteMigrationDashboard int64
)
|
||||
|
||||
// panel is the subset of a dashboard panel's JSON needed to locate a query:
// its id, its query targets and any nested panels.
type panel struct {
	ID      int64    `json:"id"`
	Targets []target `json:"targets"`
	Panels  []panel  `json:"panels"`
}

// target is the subset of a panel query's JSON used here: the refId and the
// query text stored under "target".
type target struct {
	RefID  string `json:"refId"`
	Target string `json:"target"`
}
|
||||
|
||||
// fixGraphiteReferencedSubQueries attempts to fix graphite referenced sub queries, given unified alerting does not support this.
|
||||
// targetFull of Graphite data source contains the expanded version of field 'target', so let's copy that.
|
||||
func fixGraphiteReferencedSubQueries(l log.Logger, queryData map[string]json.RawMessage, ruleID, panelID int64, dashboard *dashboards.Dashboard) map[string]json.RawMessage {
|
||||
if !isFixable(l, queryData) {
|
||||
return queryData
|
||||
}
|
||||
fullQuery, ok := queryData[graphite.TargetFullModelField]
|
||||
// We also need to check for placeholders here, as the data can be very old and suffer from a bug that only
|
||||
// the first placeholder of the same name got replaced.
|
||||
if ok && !hasPlaceholders(string(fullQuery)) {
|
||||
successfulGraphiteMigrationCopy++
|
||||
delete(queryData, graphite.TargetFullModelField)
|
||||
queryData[graphite.TargetModelField] = fullQuery
|
||||
return queryData
|
||||
}
|
||||
|
||||
// Sometimes it can happen that the "targetFull" field is not there. In this case we can
|
||||
// extract this information from the panel of the dashboard.
|
||||
fullQueryRaw, err := unwrapFromDashboard(l, queryData, panelID, dashboard)
|
||||
if err != nil {
|
||||
failedGraphiteMigrations++
|
||||
l.Error("graphite query migration: failed to unwrap query from dashboard", "err", err, "rule_id", ruleID)
|
||||
return queryData
|
||||
}
|
||||
b, err := json.Marshal(fullQueryRaw)
|
||||
if err != nil {
|
||||
failedGraphiteMigrations++
|
||||
l.Error("graphite query migration: failed to marshal the unwrapped query", "query", fullQueryRaw, "err", err, "rule_id", ruleID)
|
||||
return queryData
|
||||
}
|
||||
successfulGraphiteMigrationDashboard++
|
||||
l.Debug("graphite query migration: successfully unwrapped query using the dashboard", "query", fullQueryRaw, "rule_id", ruleID)
|
||||
queryData[graphite.TargetModelField] = b
|
||||
|
||||
return queryData
|
||||
}
|
||||
|
||||
func isFixable(l log.Logger, queryData map[string]json.RawMessage) bool {
|
||||
_, ok := queryData[graphite.TargetFullModelField]
|
||||
if ok {
|
||||
return true
|
||||
}
|
||||
// Check if the target field has any placeholders.
|
||||
targetRaw, ok := queryData[graphite.TargetModelField]
|
||||
if !ok {
|
||||
l.Error("query data does not have field 'target'")
|
||||
return false
|
||||
}
|
||||
var target string
|
||||
err := json.Unmarshal(targetRaw, &target)
|
||||
if err != nil {
|
||||
l.Error("failed to unmarshal target", "err", err)
|
||||
return false
|
||||
}
|
||||
if !hasPlaceholders(target) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func unwrapFromDashboard(l log.Logger, queryData map[string]json.RawMessage, panelID int64, dashboard *dashboards.Dashboard) (string, error) {
|
||||
refIDRaw, ok := queryData["refId"]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("query data does not have field 'refId'")
|
||||
}
|
||||
var refID string
|
||||
err := json.Unmarshal(refIDRaw, &refID)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to unmarshal refId: %w", err)
|
||||
}
|
||||
panelsRaw := dashboard.Data.Get("panels")
|
||||
// Simplejson doesn't let you unmarshal in a type, so we need to work around this.
|
||||
b, err := panelsRaw.MarshalJSON()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to marshal panels: %w", err)
|
||||
}
|
||||
var panels []panel
|
||||
err = json.Unmarshal(b, &panels)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to unmarshal panels: %w", err)
|
||||
}
|
||||
return unwrapFromPanel(l, panels, panelID, refID, 0)
|
||||
}
|
||||
|
||||
// errPanelNotFound signals that no panel with the requested id exists in the
// searched panel list (including nested panels).
var errPanelNotFound = errors.New("panel not found")
|
||||
|
||||
func unwrapFromPanel(l log.Logger, panels []panel, panelID int64, refID string, parent int64) (string, error) {
|
||||
for _, panel := range panels {
|
||||
if panel.ID == panelID {
|
||||
return unwrapTarget(refID, panel.Targets)
|
||||
}
|
||||
if len(panel.Panels) > 0 {
|
||||
unwrappedTarget, err := unwrapFromPanel(l, panel.Panels, panelID, refID, panel.ID)
|
||||
if err == nil {
|
||||
return unwrappedTarget, nil
|
||||
}
|
||||
if !errors.Is(err, errPanelNotFound) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
}
|
||||
return "", errPanelNotFound
|
||||
}
|
||||
|
||||
func unwrapTarget(refID string, targets []target) (string, error) {
|
||||
m := make(map[string]string)
|
||||
visited := make(map[string]bool)
|
||||
|
||||
// Populate the map with the target strings.
|
||||
for _, t := range targets {
|
||||
m[t.RefID] = t.Target
|
||||
}
|
||||
|
||||
_, ok := m[refID]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("refID %s not found in targets", refID)
|
||||
}
|
||||
|
||||
// Check for circular dependencies by keeping track of visited refIDs.
|
||||
// The detection of circular dependencies happens by marking a refID as visited
|
||||
// before we start its replacement process, and unmarking it after we're done.
|
||||
// This way, if we encounter the same refID during its own replacement process,
|
||||
// we know it's a circular dependency.
|
||||
var unwrap func(string) (string, error)
|
||||
unwrap = func(currentRefID string) (string, error) {
|
||||
if visited[currentRefID] {
|
||||
return "", fmt.Errorf("circular dependency on refID %s", currentRefID)
|
||||
}
|
||||
|
||||
visited[currentRefID] = true
|
||||
defer func() { visited[currentRefID] = false }()
|
||||
|
||||
currentResult, ok := m[currentRefID]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("refID %s not found in targets", currentRefID)
|
||||
}
|
||||
|
||||
matches := hasPlaceholdersRe.FindAllStringSubmatch(currentResult, -1)
|
||||
for _, match := range matches {
|
||||
placeholderRef := match[1]
|
||||
replacement, err := unwrap(placeholderRef)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
currentResult = strings.ReplaceAll(currentResult, "#"+placeholderRef, replacement)
|
||||
}
|
||||
|
||||
return currentResult, nil
|
||||
}
|
||||
|
||||
return unwrap(refID)
|
||||
}
|
||||
|
||||
func hasPlaceholders(s string) bool {
|
||||
// Use the regular expression to match the string
|
||||
return hasPlaceholdersRe.MatchString(s)
|
||||
}
|
||||
|
||||
func logGraphiteMigrationStats(l log.Logger) {
|
||||
if successfulGraphiteMigrationDashboard == 0 && successfulGraphiteMigrationCopy == 0 && failedGraphiteMigrations == 0 {
|
||||
return
|
||||
}
|
||||
l.Info("Graphite migration stats",
|
||||
"failed", failedGraphiteMigrations,
|
||||
"success_copy", successfulGraphiteMigrationCopy,
|
||||
"success_dashboard", successfulGraphiteMigrationDashboard)
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package ualert
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestUnwrapTarget(t *testing.T) {
|
||||
// Define the test cases
|
||||
testCases := []struct {
|
||||
name string
|
||||
refID string
|
||||
targets []target
|
||||
expected string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "Valid reference substitution",
|
||||
refID: "C",
|
||||
targets: []target{
|
||||
{
|
||||
RefID: "A",
|
||||
Target: "first.query.count",
|
||||
},
|
||||
{
|
||||
RefID: "B",
|
||||
Target: "second.query.count",
|
||||
},
|
||||
{
|
||||
RefID: "C",
|
||||
Target: "scale(asPercent(diffSeries(#B, #A), #B), 100)",
|
||||
},
|
||||
},
|
||||
expected: "scale(asPercent(diffSeries(second.query.count, first.query.count), second.query.count), 100)",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "No error on multiple matches",
|
||||
refID: "D",
|
||||
targets: []target{
|
||||
{
|
||||
RefID: "D",
|
||||
Target: "alias(divideSeries(scale(#C, 100.0), #B), 'SLI')",
|
||||
},
|
||||
{
|
||||
RefID: "C",
|
||||
Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count)",
|
||||
},
|
||||
{
|
||||
RefID: "B",
|
||||
Target: "sumSeries(#A, #C)",
|
||||
},
|
||||
{
|
||||
RefID: "A",
|
||||
Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count)",
|
||||
},
|
||||
},
|
||||
expected: "alias(divideSeries(scale(integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count), 100.0), sumSeries(integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count), integral(aggregations.secondly.chef.api.client.domain.transaction.statusok.count))), 'SLI')",
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "Error on circular references",
|
||||
refID: "D",
|
||||
targets: []target{
|
||||
{
|
||||
RefID: "D",
|
||||
Target: "alias(divideSeries(scale(#C, 100.0), #B), 'SLI')",
|
||||
},
|
||||
{
|
||||
RefID: "B",
|
||||
Target: "sumSeries(#A, #D)",
|
||||
},
|
||||
{
|
||||
RefID: "A",
|
||||
Target: "integral(aggregations.secondly.chef.api.client.domain.transaction.statusnook.count)",
|
||||
},
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Act
|
||||
result, err := unwrapTarget(tc.refID, tc.targets)
|
||||
|
||||
// Assert
|
||||
if tc.wantErr {
|
||||
require.Error(t, err, "unwrapTarget should return an error")
|
||||
} else {
|
||||
require.NoError(t, err, "unwrapTarget should not return an error")
|
||||
require.Equal(t, tc.expected, result, "unwrapTarget returned unexpected result")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/grafana/grafana/pkg/components/simplejson"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
)
|
||||
|
||||
func TestMigrateAlertRuleQueries(t *testing.T) {
|
||||
@@ -18,18 +20,25 @@ func TestMigrateAlertRuleQueries(t *testing.T) {
|
||||
err error
|
||||
}{
|
||||
{
|
||||
name: "when a query has a sub query - it is extracted",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{"targetFull": "thisisafullquery", "target": "ahalfquery"}),
|
||||
name: "when a query has a sub query - it is extracted",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{
|
||||
"targetFull": "thisisafullquery",
|
||||
"target": "ahalfquery",
|
||||
}),
|
||||
expected: `{"target":"thisisafullquery"}`,
|
||||
},
|
||||
{
|
||||
name: "when a query does not have a sub query - it no-ops",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{"target": "ahalfquery"}),
|
||||
name: "when a query does not have a sub query - it no-ops",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{
|
||||
"target": "ahalfquery",
|
||||
}),
|
||||
expected: `{"target":"ahalfquery"}`,
|
||||
},
|
||||
{
|
||||
name: "when query was hidden, it removes the flag",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{"hide": true}),
|
||||
name: "when query was hidden, it removes the flag",
|
||||
input: simplejson.NewFromAny(map[string]interface{}{
|
||||
"hide": true,
|
||||
}),
|
||||
expected: `{}`,
|
||||
},
|
||||
}
|
||||
@@ -38,7 +47,7 @@ func TestMigrateAlertRuleQueries(t *testing.T) {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
model, err := tt.input.Encode()
|
||||
require.NoError(t, err)
|
||||
queries, err := migrateAlertRuleQueries([]alertQuery{{Model: model}})
|
||||
queries, err := migrateAlertRuleQueries(log.NewNopLogger(), 0, []alertQuery{{Model: model, DatasourceUID: "a"}}, 0, &dashboards.Dashboard{}, map[string]string{"a": "graphite"})
|
||||
if tt.err != nil {
|
||||
require.Error(t, err)
|
||||
require.EqualError(t, err, tt.err.Error())
|
||||
|
||||
@@ -391,6 +391,8 @@ func (m *migration) Exec(sess *xorm.Session, mg *migrator.Migrator) error {
|
||||
}
|
||||
}
|
||||
|
||||
logGraphiteMigrationStats(m.mg.Logger)
|
||||
|
||||
for orgID := range rulesPerOrg {
|
||||
if err := m.addPauseSilence(orgID); err != nil {
|
||||
m.mg.Logger.Error("alert migration error: failed to create silence for paused alerts")
|
||||
|
||||
Reference in New Issue
Block a user