From fa6eaa8c155890b24459b1ae90e9374346793333 Mon Sep 17 00:00:00 2001 From: Pepe Cano <825430+ppcano@users.noreply.github.com> Date: Wed, 7 Jan 2026 15:16:38 +0100 Subject: [PATCH] [release-12.3.2] docs(alerting): new best practices guide (#115930) docs(alerting): new best practices guide (#115687) * Split best practices section * Write Examples and Guides docs * Move recording rule recommendations * docs(alerting): new best practices guide * fix vale errors * Detail meaning of alert `escalation` * Include the recovery threshold option * Include lower severity channels for infrastructure alerts * Remove timing options * minor intro edits * Rename heading to avoid gerunds (cherry picked from commit 0bfcc55411601a672f840af21f93836ed5f7484d) --- .../create-recording-rules/_index.md | 8 + .../sources/alerting/best-practices/_index.md | 50 ----- docs/sources/alerting/examples/_index.md | 22 ++ .../dynamic-labels.md | 6 +- .../dynamic-thresholds.md | 6 +- .../high-cardinality-alerts.md | 6 +- .../multi-dimensional-alerts.md | 6 +- .../table-data.md | 6 +- .../trace-based-alerts.md | 6 +- .../{best-practices => examples}/tutorials.md | 4 +- docs/sources/alerting/guides/_index.md | 35 +++ .../sources/alerting/guides/best-practices.md | 201 ++++++++++++++++++ .../connectivity-errors.md | 6 +- .../missing-data.md | 6 +- 14 files changed, 301 insertions(+), 67 deletions(-) delete mode 100644 docs/sources/alerting/best-practices/_index.md create mode 100644 docs/sources/alerting/examples/_index.md rename docs/sources/alerting/{best-practices => examples}/dynamic-labels.md (98%) rename docs/sources/alerting/{best-practices => examples}/dynamic-thresholds.md (97%) rename docs/sources/alerting/{best-practices => examples}/high-cardinality-alerts.md (97%) rename docs/sources/alerting/{best-practices => examples}/multi-dimensional-alerts.md (96%) rename docs/sources/alerting/{best-practices => examples}/table-data.md (96%) rename docs/sources/alerting/{best-practices => examples}/trace-based-alerts.md (98%) rename docs/sources/alerting/{best-practices => examples}/tutorials.md (87%) create mode 100644 docs/sources/alerting/guides/_index.md create mode 100644 docs/sources/alerting/guides/best-practices.md rename docs/sources/alerting/{best-practices => guides}/connectivity-errors.md (98%) rename docs/sources/alerting/{best-practices => guides}/missing-data.md (98%) diff --git a/docs/sources/alerting/alerting-rules/create-recording-rules/_index.md b/docs/sources/alerting/alerting-rules/create-recording-rules/_index.md index a56d3ac5bb2..bcf9663aca9 100644 --- a/docs/sources/alerting/alerting-rules/create-recording-rules/_index.md +++ b/docs/sources/alerting/alerting-rules/create-recording-rules/_index.md @@ -48,6 +48,14 @@ Recording rules can be helpful in various scenarios, such as: The evaluation group of the recording rule determines how often the metric is pre-computed. +## Recommendations + +- **Use frequent evaluation intervals**. Set frequent evaluation intervals for recording rules. Long intervals, such as an hour, can cause the recorded metric to be stale and lead to misaligned alert rule evaluations, especially when combined with a long pending period. +- **Align alert evaluation with recording frequency**. The evaluation interval of an alert rule that depends on a recorded metric should be aligned with the recording rule's interval. If a recording rule runs every 3 minutes, the alert rule should also be evaluated at a similar frequency to ensure it acts on fresh data. 
+- **Use `_over_time` functions for instant queries**. Since all alert rules are ultimately executed as an instant query, you can use functions like `max_over_time(my_metric[5m])` as an instant query. This allows you to get an aggregated value over a period without using a range query and a reduce expression. + +## Types of recording rules + Similar to alert rules, Grafana supports two types of recording rules: 1. [Grafana-managed recording rules](ref:grafana-managed-recording-rules), which can query any Grafana data source supported by alerting. It's the recommended option. diff --git a/docs/sources/alerting/best-practices/_index.md b/docs/sources/alerting/best-practices/_index.md deleted file mode 100644 index 41251a25994..00000000000 --- a/docs/sources/alerting/best-practices/_index.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/ -description: This section provides a set of guides for useful alerting practices and recommendations -keywords: - - grafana -labels: - products: - - cloud - - enterprise - - oss -menuTitle: Best practices -title: Grafana Alerting best practices -weight: 170 ---- - -# Grafana Alerting best practices - -This section provides a set of guides and examples of best practices for Grafana Alerting. Here you can learn more about how to handle common alert management problems and you can see examples of more advanced usage of Grafana Alerting. - -{{< section >}} - -Designing and configuring an alert management set up that works takes time. Here are some additional tips on how to create an effective alert management set up: - -{{< shared id="alert-planning-fundamentals" >}} - -**Which are the key metrics for your business that you want to monitor and alert on?** - -- Find events that are important to know about and not so trivial or frequent that recipients ignore them. -- Alerts should only be created for big events that require immediate attention or intervention. -- Consider quality over quantity. - -**How do you want to organize your alerts and notifications?** - -- Be selective about who you set to receive alerts. Consider sending them to the right teams, whoever is on call, and the specific channels. -- Think carefully about priority and severity levels. -- Automate as far as possible provisioning Alerting resources with the API or Terraform. - -**Which information should you include in notifications?** - -- Consider who the alert receivers and responders are. -- Share information that helps responders identify and address potential issues. -- Link alerts to dashboards to guide responders on which data to investigate. - -**How can you reduce alert fatigue?** - -- Avoid noisy, unnecessary alerts by using silences, mute timings, or pausing alert rule evaluation. -- Continually tune your alert rules to review effectiveness. Remove alert rules to avoid duplication or ineffective alerts. -- Continually review your thresholds and evaluation rules. 
- -{{< /shared >}} diff --git a/docs/sources/alerting/examples/_index.md b/docs/sources/alerting/examples/_index.md new file mode 100644 index 00000000000..3b46838b78a --- /dev/null +++ b/docs/sources/alerting/examples/_index.md @@ -0,0 +1,22 @@ +--- +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/ +description: This section provides a set of guides for useful alerting practices and recommendations +keywords: + - grafana +labels: + products: + - cloud + - enterprise + - oss +menuTitle: Examples +title: Examples +weight: 180 +--- + +# Examples + +This section provides practical examples that show how to work with different types of alerting data, apply alert design patterns, reuse alert logic, and take advantage of specific Grafana Alerting features. + +This section includes: + +{{< section >}} diff --git a/docs/sources/alerting/best-practices/dynamic-labels.md b/docs/sources/alerting/examples/dynamic-labels.md similarity index 98% rename from docs/sources/alerting/best-practices/dynamic-labels.md rename to docs/sources/alerting/examples/dynamic-labels.md index 7a9586ea6d9..6223e7ebff6 100644 --- a/docs/sources/alerting/best-practices/dynamic-labels.md +++ b/docs/sources/alerting/examples/dynamic-labels.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/dynamic-labels +aliases: + - ../best-practices/dynamic-labels/ # /docs/grafana//alerting/best-practices/dynamic-labels/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/dynamic-labels description: This example shows how to define dynamic labels based on query values, along with important behavior to keep in mind when using them. keywords: - grafana @@ -10,7 +12,7 @@ labels: - cloud - enterprise - oss -menuTitle: Examples of dynamic labels +menuTitle: Dynamic labels title: Example of dynamic labels in alert instances weight: 1104 refs: diff --git a/docs/sources/alerting/best-practices/dynamic-thresholds.md b/docs/sources/alerting/examples/dynamic-thresholds.md similarity index 97% rename from docs/sources/alerting/best-practices/dynamic-thresholds.md rename to docs/sources/alerting/examples/dynamic-thresholds.md index e749bc1d943..5ce850842a4 100644 --- a/docs/sources/alerting/best-practices/dynamic-thresholds.md +++ b/docs/sources/alerting/examples/dynamic-thresholds.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/dynamic-thresholds +aliases: + - ../best-practices/dynamic-thresholds/ # /docs/grafana//alerting/best-practices/dynamic-thresholds/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/dynamic-thresholds description: This example shows how to use a distinct threshold value per dimension using multi-dimensional alerts and a Math expression. 
keywords: - grafana @@ -10,7 +12,7 @@ labels: - cloud - enterprise - oss -menuTitle: Examples of dynamic thresholds +menuTitle: Dynamic thresholds title: Example of dynamic thresholds per dimension weight: 1105 refs: diff --git a/docs/sources/alerting/best-practices/high-cardinality-alerts.md b/docs/sources/alerting/examples/high-cardinality-alerts.md similarity index 97% rename from docs/sources/alerting/best-practices/high-cardinality-alerts.md rename to docs/sources/alerting/examples/high-cardinality-alerts.md index 9df60ac6bc6..74d080fbee5 100644 --- a/docs/sources/alerting/best-practices/high-cardinality-alerts.md +++ b/docs/sources/alerting/examples/high-cardinality-alerts.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/high-cardinality-alerts/ +aliases: + - ../best-practices/high-cardinality-alerts/ # /docs/grafana//alerting/best-practices/high-cardinality-alerts/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/high-cardinality-alerts/ description: Learn how to detect and alert on high-cardinality metrics that can overload your metrics backend and increase observability costs. keywords: - grafana @@ -8,7 +10,7 @@ labels: - cloud - enterprise - oss -menuTitle: Examples of high-cardinality alerts +menuTitle: High-cardinality alerts title: Examples of high-cardinality alerts weight: 1105 refs: diff --git a/docs/sources/alerting/best-practices/multi-dimensional-alerts.md b/docs/sources/alerting/examples/multi-dimensional-alerts.md similarity index 96% rename from docs/sources/alerting/best-practices/multi-dimensional-alerts.md rename to docs/sources/alerting/examples/multi-dimensional-alerts.md index f612df6aa37..fa6d7dd04c1 100644 --- a/docs/sources/alerting/best-practices/multi-dimensional-alerts.md +++ b/docs/sources/alerting/examples/multi-dimensional-alerts.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/multi-dimensional-alerts/ +aliases: + - ../best-practices/multi-dimensional-alerts/ # /docs/grafana//alerting/best-practices/multi-dimensional-alerts/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/multi-dimensional-alerts/ description: This example shows how a single alert rule can generate multiple alert instances using time series data. keywords: - grafana @@ -8,7 +10,7 @@ labels: - cloud - enterprise - oss -menuTitle: Examples of multi-dimensional alerts +menuTitle: Multi-dimensional alerts title: Example of multi-dimensional alerts on time series data weight: 1101 refs: diff --git a/docs/sources/alerting/best-practices/table-data.md b/docs/sources/alerting/examples/table-data.md similarity index 96% rename from docs/sources/alerting/best-practices/table-data.md rename to docs/sources/alerting/examples/table-data.md index a39530ef5a8..342475cb7fa 100644 --- a/docs/sources/alerting/best-practices/table-data.md +++ b/docs/sources/alerting/examples/table-data.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/table-data +aliases: + - ../best-practices/table-data/ # /docs/grafana//alerting/best-practices/table-data/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/table-data description: This example shows how to create an alert rule using table data. 
keywords: - grafana @@ -8,7 +10,7 @@ labels: - cloud - enterprise - oss -menuTitle: Examples of table data +menuTitle: Table data title: Example of alerting on tabular data weight: 1102 refs: diff --git a/docs/sources/alerting/best-practices/trace-based-alerts.md b/docs/sources/alerting/examples/trace-based-alerts.md similarity index 98% rename from docs/sources/alerting/best-practices/trace-based-alerts.md rename to docs/sources/alerting/examples/trace-based-alerts.md index d908ecd7385..974089b4643 100644 --- a/docs/sources/alerting/best-practices/trace-based-alerts.md +++ b/docs/sources/alerting/examples/trace-based-alerts.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/trace-based-alerts/ +aliases: + - ../best-practices/trace-based-alerts/ # /docs/grafana//alerting/best-practices/trace-based-alerts/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/trace-based-alerts/ description: This guide provides introductory examples and distinct approaches for setting up trace-based alerts in Grafana. keywords: - grafana @@ -8,7 +10,7 @@ labels: - cloud - enterprise - oss -title: Examples of trace-based alerts +title: Trace-based alerts weight: 1103 refs: testdata-data-source: diff --git a/docs/sources/alerting/best-practices/tutorials.md b/docs/sources/alerting/examples/tutorials.md similarity index 87% rename from docs/sources/alerting/best-practices/tutorials.md rename to docs/sources/alerting/examples/tutorials.md index fbef5f36797..97fa7d6a754 100644 --- a/docs/sources/alerting/best-practices/tutorials.md +++ b/docs/sources/alerting/examples/tutorials.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/tutorials/ +aliases: + - ../best-practices/tutorials/ # /docs/grafana//alerting/best-practices/tutorials/ +canonical: https://grafana.com/docs/grafana/latest/alerting/examples/tutorials/ description: This section provides a set of step-by-step tutorials guides to get started with Grafana Aletings. keywords: - grafana diff --git a/docs/sources/alerting/guides/_index.md b/docs/sources/alerting/guides/_index.md new file mode 100644 index 00000000000..69875878533 --- /dev/null +++ b/docs/sources/alerting/guides/_index.md @@ -0,0 +1,35 @@ +--- +canonical: https://grafana.com/docs/grafana/latest/alerting/guides/ +description: This section provides a set of guides for useful alerting practices and recommendations +keywords: + - grafana +labels: + products: + - cloud + - enterprise + - oss +menuTitle: Guides +title: Guides +weight: 170 +refs: + examples: + - pattern: /docs/grafana/ + destination: /docs/grafana//alerting/examples/ + - pattern: /docs/grafana-cloud/ + destination: /docs/grafana-cloud/alerting-and-irm/alerting/examples/ + tutorials: + - pattern: /docs/grafana/ + destination: /docs/grafana//alerting/examples/tutorials/ + - pattern: /docs/grafana-cloud/ + destination: /docs/grafana-cloud/alerting-and-irm/alerting/examples/tutorials/ +--- + +# Guides + +Guides in the Grafana Alerting documentation provide best practices and practical recommendations to help you move from a basic alerting setup to real-world use cases. + +These guides cover topics such as: + +{{< section >}} + +For more hands-on examples, refer to [Examples](ref:examples) and [Tutorials](ref:tutorials). 
diff --git a/docs/sources/alerting/guides/best-practices.md b/docs/sources/alerting/guides/best-practices.md
new file mode 100644
index 00000000000..d1010390569
--- /dev/null
+++ b/docs/sources/alerting/guides/best-practices.md
@@ -0,0 +1,201 @@
+---
+aliases:
+  - ../best-practices/ # /docs/grafana//alerting/best-practices/
+canonical: https://grafana.com/docs/grafana/latest/alerting/guides/best-practices/
+description: Designing and configuring an effective alerting system takes time. This guide focuses on building alerting systems that scale with real-world operations.
+keywords:
+  - grafana
+  - alerting
+  - guide
+labels:
+  products:
+    - cloud
+    - enterprise
+    - oss
+menuTitle: Best practices
+title: Best practices
+weight: 1010
+refs:
+  recovery-threshold:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/alert-rules/queries-conditions/#recovery-threshold
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/alert-rules/queries-conditions/#recovery-threshold
+  keep-firing-for:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/alert-rule-evaluation/#keep-firing-for
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/alert-rule-evaluation/#keep-firing-for
+  pending-period:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/alert-rule-evaluation/#pending-period
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/alert-rule-evaluation/#pending-period
+  silences:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/configure-notifications/create-silence/
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/configure-notifications/create-silence/
+  timing-options:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/notifications/group-alert-notifications/#timing-options
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/notifications/group-alert-notifications/#timing-options
+  group-alert-notifications:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/notifications/group-alert-notifications/
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/notifications/group-alert-notifications/
+  notification-policies:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/notifications/notification-policies/
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/notifications/notification-policies/
+  annotations:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/fundamentals/alert-rules/annotation-label/#annotations
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/alert-rules/annotation-label/#annotations
+  multi-dimensional-alerts:
+    - pattern: /docs/grafana/
+      destination: /docs/grafana//alerting/examples/multi-dimensional-alerts/
+    - pattern: /docs/grafana-cloud/
+      destination: /docs/grafana-cloud/alerting-and-irm/alerting/examples/multi-dimensional-alerts/
+---
+
+# Alerting best practices
+
+Designing and configuring an effective alerting system takes time. This guide focuses on building alerting systems that scale with real-world operations.
+ +The practices described here are intentionally high-level and apply regardless of tooling. Whether you use Prometheus, Grafana Alerting, or another stack, the same constraints apply: complex systems, imperfect signals, and humans on call. + +Alerting is never finished. It evolves with incidents, organizational changes, and the systems it’s meant to protect. + +{{< shared id="alert-planning-fundamentals" >}} + +## Prioritize symptoms, but don’t ignore infrastructure signals + +Alerts should primarily detect user-facing failures, not internal component behavior. Users don't care that a pod restarted; they care when the application is slow or failing. Symptom-based alerts tie directly to user impact. + +Reliability metrics that impact users—latency, errors, availability—are better paging signals than infrastructure events or internal errors. + +That said, infrastructure signals still matter. They can act as early warning indicators and are often useful when alerting maturity is low. A sustained spike in CPU or memory usage might not justify a page, but it can help explain or anticipate symptom-based failures. + +Infrastructure alerts tend to be noisy and are often ignored when treated like paging signals. They are usually better suited for lower-severity channels such as dashboards, alert lists, or non-paging destinations like a dedicated Slack channel, where they can be monitored without interrupting on-call. + +The key is balance as your alerting matures. Use infrastructure alerts to support diagnosis and prevention, not as a replacement for symptom-based alerts. + +## Escalate priority based on confidence + +Alert priority is often tied to user impact and the urgency to respond, but confidence should determine when escalation is necessary. + +In this context, escalation defines how responders are notified as confidence grows. This can include increasing alert priority, widening notification, paging additional responders, or opening an incident once intervention is clearly required. + +Early signals are often ambiguous, and confidence in a non-transient failure is usually low. Paging too early creates noise; paging too late means users are impacted for longer before anyone acts. A small or sudden increase in latency may not justify immediate action, but it can indicate a failure in progress. + +Confidence increases as signals become stronger or begin to correlate. + +Escalation is justified when issues are sustained or reinforced by multiple signals. For example, high latency combined with a rising error rate, or the same event firing over a sustained period. These patterns reduce the chance of transient noise and increase the likelihood of real impact. + +Use confidence in user impact to drive escalation and avoid unnecessary pages. + +## Scope alerts for scalability and actionability + +In distributed systems, avoid creating separate alert rules for every host, service, or endpoint. Instead, define alert rules that scale automatically using [multi-dimensional alert rules](ref:multi-dimensional-alerts). This reduces rule duplication and allows alerting to scale as the system grows. + +Start simple. Default to a single dimension such as `service` or `endpoint` to keep alerts manageable. Add dimensions only when they improve actionability. For example, when missing a dimension like `region` hides failures or doesn't provide enough information to act quickly. + +Additional dimensions like `region` or `instance` can help identify the root cause, but more isn't always better. 
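+
+For instance, a single multi-dimensional alert rule can group a query by `service`, producing one alert instance per service. The following is a minimal PromQL sketch of that pattern; the `http_requests_total` counter, its `status` and `service` labels, and the 5% threshold are illustrative assumptions, not a prescribed setup:
+
+```promql
+# One alert instance per service: ratio of 5xx responses over the last 5 minutes.
+sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
+  /
+sum by (service) (rate(http_requests_total[5m]))
+  > 0.05
+```
+
+Adding `region` to the `by (...)` clause would produce one instance per service and region; do that only when the extra dimension makes the alert easier to act on.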
+ +## Design alerts for first responders and clear actions + +Alerts should be designed for the first responder, not the person who created the alert. Anyone on call should be able to understand what's wrong and what to do next without deep knowledge of the system or alert configuration. + +Avoid vague alerts that force responders to spend time figuring out context. Every alert should clearly explain why it exists, what triggered it, and how to investigate. Use [annotations](ref:annotations) to link to relevant dashboards and runbooks, which are essential for faster resolution. + +Alerts should indicate a real problem and be actionable, even if the impact is low. Informational alerts add noise without improving reliability. + +If no action is possible, it shouldn't be an alert—consider using a dashboard instead. Over time, alerts behave like technical debt: easy to create, costly to maintain, and hard to remove. + +Review alerts often and remove those that don’t lead to action. + +## Alerts should have an owner and system scope + +Alerts without ownership are often ignored. Every alert must have an owner: a team responsible for maintaining the alert and responding when it fires. + +Alerts must also define a system scope, such as a service or infrastructure component. Scope provides organizational context and connects alerts with ownership. Defining clear scopes is easier when services are treated as first-class entities, and organizations are built around service ownership. + +> [Service Center in Grafana Cloud](/docs/grafana-cloud/alerting-and-irm/service-center/) can help operate a service-oriented view of your system and align alert scope with ownership. + +After scope, ownership, and alert priority are defined, routing determines where alerts go and how they escalate. **Notification routing is as important as the alerts**. + +Alerts should be delivered to the right team and channel based on priority, ownership, and team workflows. Use [notification policies](ref:notification-policies) to define a routing tree that matches the context of your service or scope: + +- Define a parent policy for default routing within the scope. +- Define nested policies for specific cases or higher-priority issues. + +## Prevent notification overload with alert grouping + +Without alert grouping, responders can receive many notifications for the same underlying problem. + +For example, a database failure can trigger several alerts at the same time like increased latency, higher error rates, and internal errors. Paging separately for each symptom quickly turns into notification spam, even though there is a single root cause. + +[Notification grouping](ref:group-alert-notifications) consolidates related alerts into a single notification. Instead of receiving multiple pages for the same issue, responders get one alert that represents the incident and includes all related firing alerts. + +Grouping should follow operational boundaries such as service or owner, as defined by notification policies. Downstream or cascading failures should be grouped together so they surface as one issue rather than many. + +## Mitigate flapping alerts + +Short-lived failure spikes often trigger alerts that auto-resolve quickly. Alerting on transient failures creates noise and leads responders to ignore them. + +Require issues to persist before alerting. Set a [pending period](ref:pending-period) to define how long a condition must remain true before firing. 
For example, instead of alerting immediately on high error rate, require it to stay above the threshold for several minutes.
+
+Also, stabilize alerts by tuning query ranges and aggregations. Using raw data makes alerts sensitive to noise. Instead, evaluate over a time window and aggregate the data to smooth short spikes.
+
+```promql
+# Reacts to transient spikes. Avoid this.
+cpu_usage > 90
+
+# Smooths short-lived spikes.
+avg_over_time(cpu_usage[5m]) > 90
+```
+
+For latency and error-based alerts, percentiles are often more useful than averages:
+
+```promql
+quantile_over_time(0.95, http_duration_seconds[5m]) > 3
+```
+
+Finally, avoid rapid resolve-and-fire notifications by using [`keep_firing_for`](ref:keep-firing-for) or [recovery thresholds](ref:recovery-threshold) to keep alerts active briefly during recovery. Both options reduce flapping and unnecessary notifications.
+
+## Graduate symptom-based alerts into SLOs
+
+When a symptom-based alert fires frequently, it usually indicates a reliability concern that should be measured and managed more deliberately. This is often a sign that the alert could evolve into an [SLO](/docs/grafana-cloud/alerting-and-irm/slo/).
+
+Traditional alerts create pressure to react immediately, while error budgets introduce a buffer of time to act, changing how urgency is handled. Alerts can then be defined in terms of error budget burn rate rather than reacting to every minor deviation.
+
+SLOs also align distinct teams around common reliability goals by providing a shared definition of what "good" looks like. They help consolidate multiple symptom alerts into a single user-facing objective.
+
+For example, instead of several teams alerting on high latency, a single SLO can be used across teams to capture overall API performance.
+
+## Integrate alerting into incident post-mortems
+
+Every incident is an opportunity to improve alerting. After each incident, evaluate whether alerts helped responders act quickly or added unnecessary noise.
+
+Assess which alerts fired, and how they influenced incident response. Review whether alerts triggered too late, too early, or without enough context, and adjust thresholds, priority, or escalation based on what actually happened.
+
+Use [silences](ref:silences) during active incidents to reduce repeated notifications, but scope them carefully to avoid silencing unrelated alerts.
+
+Post-mortems should evaluate alerts alongside root causes and lessons learned. If responders lacked key information during the incident, enrich alerts with additional context, dashboards, or better guidance.
+
+## Alerts should be continuously improved
+
+Alerting is an iterative process. Alerts that aren’t reviewed and refined lose effectiveness as systems and traffic patterns change.
+
+Schedule regular reviews of existing alerts. Remove alerts that don’t lead to action, and tune alerts or thresholds that fire too often without providing a useful signal. Reduce false positives to combat alert fatigue.
+
+Prioritize clarity and simplicity in alert design. Simpler alerts are easier to understand, maintain, and trust under pressure. Favor fewer high-quality, actionable alerts over a large number of low-value ones.
+
+Use dashboards and observability tools, not alerts, for investigation.
+ +{{< /shared >}} diff --git a/docs/sources/alerting/best-practices/connectivity-errors.md b/docs/sources/alerting/guides/connectivity-errors.md similarity index 98% rename from docs/sources/alerting/best-practices/connectivity-errors.md rename to docs/sources/alerting/guides/connectivity-errors.md index 42d992b0cb0..18e12f5bc49 100644 --- a/docs/sources/alerting/best-practices/connectivity-errors.md +++ b/docs/sources/alerting/guides/connectivity-errors.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/connectivity-errors/ +aliases: + - ../best-practices/connectivity-errors/ # /docs/grafana//alerting/best-practices/connectivity-errors/ +canonical: https://grafana.com/docs/grafana/latest/alerting/guides/connectivity-errors/ description: Learn how to detect and handle connectivity issues in alerts using Prometheus, Grafana Alerting, or both. keywords: - grafana @@ -14,7 +16,7 @@ labels: - oss menuTitle: Handle connectivity errors title: Handle connectivity errors in alerts -weight: 1010 +weight: 1020 refs: pending-period: - pattern: /docs/grafana/ diff --git a/docs/sources/alerting/best-practices/missing-data.md b/docs/sources/alerting/guides/missing-data.md similarity index 98% rename from docs/sources/alerting/best-practices/missing-data.md rename to docs/sources/alerting/guides/missing-data.md index 06346ce3d05..c2d2500cdfd 100644 --- a/docs/sources/alerting/best-practices/missing-data.md +++ b/docs/sources/alerting/guides/missing-data.md @@ -1,5 +1,7 @@ --- -canonical: https://grafana.com/docs/grafana/latest/alerting/best-practices/missing-data/ +aliases: + - ../best-practices/missing-data/ # /docs/grafana//alerting/best-practices/missing-data/ +canonical: https://grafana.com/docs/grafana/latest/alerting/guides/missing-data/ description: Learn how to detect missing metrics and design alerts that handle gaps in data in Prometheus and Grafana Alerting. keywords: - grafana @@ -14,7 +16,7 @@ labels: - oss menuTitle: Handle missing data title: Handle missing data in Grafana Alerting -weight: 1020 +weight: 1030 refs: connectivity-errors-guide: - pattern: /docs/grafana/