diff --git a/k8s/monitoring/templates/node-alerts.yaml b/k8s/monitoring/templates/node-alerts.yaml
index 930f190..5954d50 100644
--- a/k8s/monitoring/templates/node-alerts.yaml
+++ b/k8s/monitoring/templates/node-alerts.yaml
@@ -6,10 +6,29 @@ metadata:
     app: kube-prometheus-stack
     app.kubernetes.io/instance: {{ .Release.Name }}
 spec:
-  groups: 
+  groups:
   - name: node.alerts
     rules:
-    - alert: NodeHighCPU
+    - alert: InternalTestAlert
+      expr: vector(1)
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Internal Alert Pipeline Test"
+        description: "This alert is manually triggered to verify the Slack alerting pipeline."
+
+    - alert: NodeMissingWorkloadLabel
+      expr: |
+        count by (instance) (node_cpu_seconds_total) unless count by (instance) (node_cpu_seconds_total{workload=~".+"})
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node missing workload label on metrics"
+        description: "Metrics for instance {{ "{{" }} $labels.instance {{ "}}" }} are missing the 'workload' label, which is required for the MediaNodeHighCPU alert."
+
+    - alert: MediaNodeHighCPU
       expr: |
         (
          (1 - avg without (cpu, mode) (rate(node_cpu_seconds_total{mode="idle", workload="media"}[1m]))) * 100 > {{ .Values.cpuThresholdMedia | default 65 }}
@@ -23,4 +42,4 @@ spec:
         severity: warning
       annotations:
         summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
-        description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has CPU usage above threshold (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"
+        description: "Node {{ "{{" }} $labels.instance {{ "}}" }} (workload: {{ "{{" }} $labels.workload {{ "}}" }}) has CPU usage above threshold (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"
diff --git a/k8s/monitoring/values-production.yaml b/k8s/monitoring/values-production.yaml
index 6841285..e6f6724 100644
--- a/k8s/monitoring/values-production.yaml
+++ b/k8s/monitoring/values-production.yaml
@@ -170,6 +170,15 @@ kube-prometheus-stack:
       repeat_interval: 12h
      receiver: 'null'
       routes:
+      - match:
+          alertname: InternalTestAlert
+        receiver: 'slack-notifications'
+      - match:
+          alertname: MediaNodeHighCPU
+        receiver: 'slack-notifications'
+      - match:
+          alertname: NodeMissingWorkloadLabel
+        receiver: 'slack-notifications'
       - match:
           alertname: NodeHighCPU
         receiver: 'slack-notifications'