diff --git a/k8s/monitoring/templates/node-alerts.yaml b/k8s/monitoring/templates/node-alerts.yaml
index cbec3a7..016eaa0 100644
--- a/k8s/monitoring/templates/node-alerts.yaml
+++ b/k8s/monitoring/templates/node-alerts.yaml
@@ -3,17 +3,18 @@ kind: PrometheusRule
 metadata:
   name: cluster-node-alerts
   labels:
-    app: prometheus-stack
-    release: prometheus-stack
+    app: kube-prometheus-stack
+    # Quoted so an all-digit or boolean-like release name stays a string label value.
+    app.kubernetes.io/instance: {{ .Release.Name | quote }}
 spec:
   groups:
     - name: node.alerts
       rules:
         - alert: NodeHighCPU
-          expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 20
+          expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 20
           for: 2m
           labels:
             severity: warning
           annotations:
-            summary: "High CPU usage on node {{ $labels.instance }}"
-            description: "Node {{ $labels.instance }} has CPU usage above 20% (current value: {{ $value | printf \"%.2f\" }}%)"
+            summary: "High CPU usage on node {{ "{{" }} $labels.instance {{ "}}" }}"
+            description: "Node {{ "{{" }} $labels.instance {{ "}}" }} has CPU usage above 20% (current value: {{ "{{" }} $value | printf \"%.2f\" {{ "}}" }}%)"
diff --git a/k8s/monitoring/templates/node-pool-health-dashboard.yaml b/k8s/monitoring/templates/node-pool-health-dashboard.yaml
index 44bf705..b8b5819 100644
--- a/k8s/monitoring/templates/node-pool-health-dashboard.yaml
+++ b/k8s/monitoring/templates/node-pool-health-dashboard.yaml
@@ -50,8 +50,8 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(node_cpu_seconds_total{mode!=\"idle\"}) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
-              "legendFormat": "{{workload}} CPU %",
+              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
+              "legendFormat": "{{ "{{" }}workload{{ "}}" }} CPU %",
               "refId": "A"
             }
           ],
@@ -73,7 +73,7 @@ data:
           "targets": [
             {
               "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
-              "legendFormat": "{{workload}} Mem %",
+              "legendFormat": "{{ "{{" }}workload{{ "}}" }} Mem %",
               "refId": "A"
             }
           ],
@@ -87,7 +87,7 @@ data:
           "targets": [
             {
               "expr": "sum(rate(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
-              "legendFormat": "{{namespace}}/{{pod}} ({{node}})",
+              "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}pod{{ "}}" }} ({{ "{{" }}node{{ "}}" }})",
               "refId": "A"
             }
           ],
diff --git a/k8s/monitoring/templates/pod-alerts.yaml b/k8s/monitoring/templates/pod-alerts.yaml
index 8e8e698..f5b4e2b 100644
--- a/k8s/monitoring/templates/pod-alerts.yaml
+++ b/k8s/monitoring/templates/pod-alerts.yaml
@@ -3,8 +3,9 @@ kind: PrometheusRule
 metadata:
   name: cluster-pod-alerts
   labels:
-    app: prometheus-stack
-    release: prometheus-stack
+    app: kube-prometheus-stack
+    # Quoted so an all-digit or boolean-like release name stays a string label value.
+    app.kubernetes.io/instance: {{ .Release.Name | quote }}
 spec:
   groups:
     - name: pod.alerts
@@ -16,7 +17,7 @@ spec:
             severity: critical
           annotations:
             summary: "Pod OOMKilled"
-            description: "Container {{ $labels.container }} in pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) was OOMKilled."
+            description: "Container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} (namespace {{ "{{" }} $labels.namespace {{ "}}" }}) was OOMKilled."
 
         - alert: PodCrashLoopBackOff
           expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0
@@ -25,4 +26,4 @@ spec:
             severity: critical
           annotations:
             summary: "Pod in CrashLoopBackOff"
-            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is in CrashLoopBackOff."
+            description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} in namespace {{ "{{" }} $labels.namespace {{ "}}" }} is in CrashLoopBackOff."