2025-04-01 10:38:02 +09:00

282 lines
8.5 KiB
Go

package prom
import (
"fmt"
"time"
"github.com/google/uuid"
"gopkg.in/yaml.v3"
"github.com/grafana/grafana/pkg/services/datasources"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/util"
)
const (
// ruleUIDLabel is a special label that can be used to set a custom UID for a Prometheus
// alert rule when converting it to a Grafana alert rule. If this label is not present,
// a stable UID is generated automatically from the org ID, the namespace UID, the
// group name and the rule's position within the group.
ruleUIDLabel = "__grafana_alert_rule_uid__"
)
// Reference IDs of the query nodes that make up a converted rule
// (see the createQuery documentation for how the nodes are chained).
const (
// queryRefID is the ref ID of the node that runs the rule expression in the
// configured datasource. Recording rules record their metric from this node.
queryRefID = "query"
// prometheusMathRefID is the ref ID of the math node described in createQuery.
prometheusMathRefID = "prometheus_math"
// thresholdRefID is the ref ID of the threshold node described in createQuery.
thresholdRefID = "threshold"
)
// Config defines the configuration options for the Prometheus to Grafana rules converter.
type Config struct {
// DatasourceUID is the UID of the datasource the converted queries run against.
// Required: NewConverter rejects an empty value.
DatasourceUID string
// DatasourceType is the type of that datasource. Required; NewConverter only
// accepts datasources.DS_PROMETHEUS and datasources.DS_LOKI.
DatasourceType string
// DefaultInterval is the default interval for rules in the groups that
// don't have Interval set.
DefaultInterval time.Duration
// FromTimeRange is passed to the query node of every converted rule.
// When nil, NewConverter falls back to defaultTimeRange.
FromTimeRange *time.Duration
// EvaluationOffset is passed to the query node of every converted rule.
// When nil, NewConverter falls back to defaultEvaluationOffset.
EvaluationOffset *time.Duration
// ExecErrState is copied to every converted rule.
// When empty, NewConverter falls back to models.ErrorErrState.
ExecErrState models.ExecutionErrorState
// NoDataState is copied to every converted rule.
// When empty, NewConverter falls back to models.OK.
NoDataState models.NoDataState
// RecordingRules holds the settings applied to converted recording rules.
RecordingRules RulesConfig
// AlertRules holds the settings applied to converted alerting rules.
AlertRules RulesConfig
}
// RulesConfig contains configuration that applies to either recording or alerting rules.
type RulesConfig struct {
// IsPaused is copied to the IsPaused field of every converted rule of the
// corresponding kind.
IsPaused bool
}
// Fallback values that NewConverter applies to Config fields left unset.
var (
	// defaultTimeRange is used when Config.FromTimeRange is nil.
	defaultTimeRange = 10 * time.Minute
	// defaultEvaluationOffset is used when Config.EvaluationOffset is nil.
	defaultEvaluationOffset = time.Duration(0)

	// defaultConfig bundles the defaults for all optional Config fields.
	// Its duration fields point at the two variables above.
	defaultConfig = Config{
		ExecErrState:     models.ErrorErrState,
		NoDataState:      models.OK,
		FromTimeRange:    &defaultTimeRange,
		EvaluationOffset: &defaultEvaluationOffset,
	}
)
// Converter converts Prometheus rule groups into Grafana Alerting rule groups
// according to its configuration. Use NewConverter to create one.
type Converter struct {
// cfg is the configuration as validated and completed with defaults by NewConverter.
cfg Config
}
// NewConverter creates a new Converter instance with the provided configuration.
//
// DatasourceUID, DatasourceType and a positive DefaultInterval are required, and
// DatasourceType must be either datasources.DS_PROMETHEUS or datasources.DS_LOKI.
// Optional fields that are unset (FromTimeRange, EvaluationOffset, ExecErrState,
// NoDataState) are filled in from defaultConfig. cfg is received by value, so the
// caller's Config is never modified.
//
// It returns an error describing the first requirement that is not met.
func NewConverter(cfg Config) (*Converter, error) {
	switch {
	case cfg.DatasourceUID == "":
		return nil, fmt.Errorf("datasource UID is required")
	case cfg.DatasourceType == "":
		return nil, fmt.Errorf("datasource type is required")
	case cfg.DefaultInterval == 0:
		return nil, fmt.Errorf("default evaluation interval is required")
	case cfg.DefaultInterval < 0:
		// A negative default would end up as a negative IntervalSeconds on
		// every rule of a group that does not set its own interval.
		return nil, fmt.Errorf("default evaluation interval must be positive, got %s", cfg.DefaultInterval)
	case cfg.DatasourceType != datasources.DS_PROMETHEUS && cfg.DatasourceType != datasources.DS_LOKI:
		return nil, fmt.Errorf("invalid datasource type: %s", cfg.DatasourceType)
	}

	// Fill in defaults for everything the caller left unset.
	if cfg.FromTimeRange == nil {
		cfg.FromTimeRange = defaultConfig.FromTimeRange
	}
	if cfg.EvaluationOffset == nil {
		cfg.EvaluationOffset = defaultConfig.EvaluationOffset
	}
	if cfg.ExecErrState == "" {
		cfg.ExecErrState = defaultConfig.ExecErrState
	}
	if cfg.NoDataState == "" {
		cfg.NoDataState = defaultConfig.NoDataState
	}

	return &Converter{cfg: cfg}, nil
}
// PrometheusRulesToGrafana converts a Prometheus rule group into Grafana Alerting rule group.
//
// Every rule of the group is validated first; the first validation error is
// returned as is and nothing is converted. A failure during the conversion
// itself is wrapped with the name of the group.
func (p *Converter) PrometheusRulesToGrafana(orgID int64, namespaceUID string, group PrometheusRuleGroup) (*models.AlertRuleGroup, error) {
	// Reject the whole group up front if any single rule is invalid.
	for _, candidate := range group.Rules {
		err := candidate.Validate()
		if err != nil {
			return nil, err
		}
	}

	converted, convErr := p.convertRuleGroup(orgID, namespaceUID, group)
	if convErr != nil {
		return nil, fmt.Errorf("failed to convert rule group '%s': %w", group.Name, convErr)
	}

	return converted, nil
}
// convertRuleGroup converts every rule of promGroup into a Grafana alert rule
// and assembles the results into an AlertRuleGroup placed in the folder
// namespaceUID and titled after the Prometheus group.
//
// The group's own interval is used when it is set; otherwise the converter's
// DefaultInterval applies. Rules keep their order and get a 1-based
// RuleGroupIndex. Each rule's UID comes from getUID.
//
// It returns an error as soon as one rule cannot be converted or its UID
// cannot be determined.
func (p *Converter) convertRuleGroup(orgID int64, namespaceUID string, promGroup PrometheusRuleGroup) (*models.AlertRuleGroup, error) {
	interval := time.Duration(promGroup.Interval)
	if interval == 0 {
		interval = p.cfg.DefaultInterval
	}
	// The same for every rule of the group, so compute it once.
	// The conversion truncates, i.e. fractions of a second are dropped.
	intervalSeconds := int64(interval.Seconds())

	uniqueNames := map[string]int{}
	rules := make([]models.AlertRule, 0, len(promGroup.Rules))

	for i, rule := range promGroup.Rules {
		gr, err := p.convertRule(orgID, namespaceUID, promGroup.Name, rule)
		if err != nil {
			// Recording rules have no alert name; identify them by the
			// recorded metric so the error does not show an empty name.
			ruleName := rule.Alert
			if ruleName == "" {
				ruleName = rule.Record
			}
			return nil, fmt.Errorf("failed to convert Prometheus rule '%s' to Grafana rule: %w", ruleName, err)
		}
		gr.RuleGroupIndex = i + 1
		gr.IntervalSeconds = intervalSeconds

		// Check rule title uniqueness within the group: the second and later
		// occurrences of a title get a " (N)" suffix.
		// NOTE(review): a suffixed title can still collide with a rule that is
		// literally named that way (e.g. "A", "A" and "A (2)").
		uniqueNames[gr.Title]++
		if val := uniqueNames[gr.Title]; val > 1 {
			gr.Title = fmt.Sprintf("%s (%d)", gr.Title, val)
		}

		uid, err := getUID(orgID, namespaceUID, promGroup.Name, i, rule)
		if err != nil {
			return nil, fmt.Errorf("failed to generate UID for rule '%s': %w", gr.Title, err)
		}
		gr.UID = uid

		rules = append(rules, gr)
	}

	return &models.AlertRuleGroup{
		FolderUID: namespaceUID,
		Interval:  intervalSeconds,
		Rules:     rules,
		Title:     promGroup.Name,
	}, nil
}
// getUID returns a UID for a Prometheus rule.
//
// When the rule carries the ruleUIDLabel label, its value is validated and
// returned; an invalid value results in an error. Without the label, a stable
// UUID is derived (SHA-1 based) from the org ID, the namespace UID, the group
// name and the position of the rule, so the same inputs always yield the same UID.
func getUID(orgID int64, namespaceUID string, group string, position int, promRule PrometheusRule) (string, error) {
	custom, found := promRule.Labels[ruleUIDLabel]
	if !found {
		// No explicit UID requested: hash the rule's coordinates.
		seed := fmt.Sprintf("%d|%s|%s|%d", orgID, namespaceUID, group, position)
		return uuid.NewSHA1(uuid.NameSpaceOID, []byte(seed)).String(), nil
	}

	if err := util.ValidateUID(custom); err != nil {
		return "", fmt.Errorf("invalid UID label value: %s; %w", custom, err)
	}
	return custom, nil
}
// convertRule converts a single Prometheus rule into a Grafana alert rule that
// belongs to the given org, namespace (folder) and group.
//
// A rule with a non-empty Record field is treated as a recording rule: it is
// titled after the recorded metric, gets a Record section reading from the
// query node, and uses the RecordingRules settings. Any other rule is treated
// as an alerting rule, titled after its Alert field and using the AlertRules
// settings. The last query node is always the rule's condition, and the
// original rule is stored in the metadata as YAML.
//
// The returned rule has no UID, group index or interval; the caller sets them.
func (p *Converter) convertRule(orgID int64, namespaceUID, group string, rule PrometheusRule) (models.AlertRule, error) {
	recording := rule.Record != ""

	nodes, err := p.createQuery(rule.Expr, recording)
	if err != nil {
		return models.AlertRule{}, err
	}

	// Start from the alerting-rule settings and override them for recording rules.
	name, paused := rule.Alert, p.cfg.AlertRules.IsPaused
	var recordSpec *models.Record
	if recording {
		name, paused = rule.Record, p.cfg.RecordingRules.IsPaused
		recordSpec = &models.Record{
			From:   queryRefID,
			Metric: rule.Record,
		}
	}

	// Temporary workaround for the rule title uniqueness check: Grafana requires
	// titles to be unique within the same org and folder, whereas Prometheus
	// allows several rules with the same name. Prefixing the title with the
	// group name keeps equally named rules of different groups apart.
	// TODO: Remove this workaround when we have a proper solution for handling rule title uniqueness.
	name = fmt.Sprintf("[%s] %s", group, name)

	// Copy the labels so the converted rule does not share the map with the input.
	ruleLabels := make(map[string]string, len(rule.Labels)+1)
	for key, value := range rule.Labels {
		ruleLabels[key] = value
	}

	definition, err := yaml.Marshal(rule)
	if err != nil {
		return models.AlertRule{}, fmt.Errorf("failed to marshal original rule definition: %w", err)
	}

	// A missing "for" means the zero duration.
	var pendingPeriod time.Duration
	if rule.For != nil {
		pendingPeriod = time.Duration(*rule.For)
	}

	return models.AlertRule{
		OrgID:        orgID,
		NamespaceUID: namespaceUID,
		RuleGroup:    group,
		Title:        name,
		Data:         nodes,
		Condition:    nodes[len(nodes)-1].RefID,
		For:          pendingPeriod,
		NoDataState:  p.cfg.NoDataState,
		ExecErrState: p.cfg.ExecErrState,
		Labels:       ruleLabels,
		Annotations:  rule.Annotations,
		IsPaused:     paused,
		Record:       recordSpec,
		Metadata: models.AlertRuleMetadata{
			PrometheusStyleRule: &models.PrometheusStyleRule{
				OriginalRuleDefinition: string(definition),
			},
		},
	}, nil
}
// createQuery builds the chain of alert query nodes that evaluates the given
// Prometheus rule expression.
//
// Recording rules need a single node only: the query node, which executes the
// PromQL query in the configured datasource.
//
// Alerting rules get three nodes, in this order:
//  1. Query Node (query): executes the PromQL query using the configured datasource.
//  2. Math Node (prometheus_math): applies the math expression
//     "is_number($query) || is_nan($query) || is_inf($query)".
//  3. Threshold Node (threshold): takes the result of the math node and checks
//     that it is greater than 0.
//
// The extra nodes preserve the Prometheus behaviour, where any returned result
// is considered alerting and the alert is treated as normal only when the
// query returns no data.
func (p *Converter) createQuery(expr string, isRecordingRule bool) ([]models.AlertQuery, error) {
	dataNode, err := createQueryNode(
		p.cfg.DatasourceUID,
		p.cfg.DatasourceType,
		expr,
		*p.cfg.FromTimeRange,
		*p.cfg.EvaluationOffset,
	)
	switch {
	case err != nil:
		return nil, err
	case isRecordingRule:
		// Recording rules store the query result directly.
		return []models.AlertQuery{dataNode}, nil
	}

	isValueNode, err := createMathNode()
	if err != nil {
		return nil, err
	}

	aboveZeroNode, err := createThresholdNode()
	if err != nil {
		return nil, err
	}

	return []models.AlertQuery{dataNode, isValueNode, aboveZeroNode}, nil
}