video-iac/k8s/monitoring/templates/node-pool-health-dashboard....

109 lines
3.1 KiB
YAML

---
# ConfigMap that ships the "Node Pool Health" Grafana dashboard as embedded JSON
# under the data key `node-pool-health.json`.
#
# Discovery:
#   The `grafana_dashboard: "1"` label is presumably what a Grafana dashboard
#   sidecar selects on -- TODO confirm against the sidecar's configured label.
#
# Templating:
#   The legendFormat values below use Go-template string literals so that the
#   template engine emits Grafana's literal double-brace series placeholders
#   (workload, namespace, pod, node). This file must therefore be rendered by
#   the template engine before it is valid dashboard JSON. Do not write literal
#   double braces in these comments: the template engine would evaluate them.
#
# Panels:
#   1. CPU utilization (%) per `workload`   -- stat, yellow >= 70, red >= 90
#   2. Memory utilization (%) per `workload` -- stat, same thresholds as panel 1
#   3. Pod restarts over the last 5 minutes  -- timeseries, only series > 0
#
# Panel 3 uses increase() rather than rate() so the plotted value is the number
# of restarts in the 5m window (matching the panel title) instead of a
# per-second rate.
#
# NOTE(review): panels 1 and 2 group by a `workload` label. This assumes the
#   node_cpu_seconds_total, machine_cpu_cores and node_memory_* series all carry
#   that label (e.g. via relabeling) -- verify against the Prometheus config.
# NOTE(review): panel 3 groups by `node`. Confirm that
#   kube_pod_container_status_restarts_total carries a `node` label in this
#   Prometheus; if it does not, the node part of the legend renders empty.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-pool-health-dashboard
  labels:
    grafana_dashboard: "1"
data:
  node-pool-health.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "hideControls": false,
      "id": null,
      "iteration": 1583185057230,
      "links": [],
      "panels": [
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 0 },
          "id": 1,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["mean"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (workload) / sum(machine_cpu_cores) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} CPU %",
              "refId": "A"
            }
          ],
          "title": "CPU Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 70 },
                  { "color": "red", "value": 90 }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
          "id": 2,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["mean"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) by (workload) / sum(node_memory_MemTotal_bytes) by (workload) * 100",
              "legendFormat": "{{ "{{" }}workload{{ "}}" }} Mem %",
              "refId": "A"
            }
          ],
          "title": "Memory Utilization by Node Pool",
          "type": "stat"
        },
        {
          "datasource": "Prometheus",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
          "id": 3,
          "targets": [
            {
              "expr": "sum(increase(kube_pod_container_status_restarts_total[5m])) by (namespace, pod, node) > 0",
              "legendFormat": "{{ "{{" }}namespace{{ "}}" }}/{{ "{{" }}pod{{ "}}" }} ({{ "{{" }}node{{ "}}" }})",
              "refId": "A"
            }
          ],
          "title": "Pod Restarts (Last 5m)",
          "type": "timeseries"
        }
      ],
      "refresh": "10s",
      "schemaVersion": 30,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "title": "Node Pool Health",
      "uid": "node-pool-health"
    }