# video-iac/k8s/webrtc-be/templates/alerts.yaml

# PrometheusRule for the webrtc-be workload. Two alerts are active:
#   - WebrtcBeCrashed: the container restart counter increased in the last 5m.
#   - WebrtcBeDown:    the deployment has had zero available replicas for 1m.
# Both carry a loki_link annotation pointing at the Grafana Loki logs
# dashboard, pre-filtered to this release's namespace and container.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: webrtc-be-log-alerts
  labels:
    app: webrtc-be
    # NOTE(review): presumably the label the Prometheus Operator ruleSelector
    # matches on - confirm against the prometheus-stack release values.
    release: prometheus-stack
spec:
  groups:
    - name: webrtc-be.alerts
      rules:
        # Disabled log-based rule, kept for reference.
        # NOTE(review): the expr below is LogQL (stream selector plus a line
        # filter), not PromQL; presumably that is why it was disabled - confirm.
        # Helm still evaluates the template action in the loki_link line even
        # though it sits inside a YAML comment.
        # - alert: WebrtcBeError
        #   expr: 'sum(count_over_time({container="webrtc-be", namespace="webrtc-be"} |= "error" [5m])) > 0'
        #   for: 1m
        #   labels:
        #     severity: critical
        #   annotations:
        #     summary: "Errors found in webrtc-be logs"
        #     description: "The webrtc-be container is logging errors. Please check the logs."
        #     loki_link: >-
        #       {{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}]

        # Fires when the container restart counter grew within the last 5m.
        # The namespace selector follows the release namespace so it always
        # agrees with the namespace used in loki_link.
        - alert: WebrtcBeCrashed
          expr: >-
            increase(kube_pod_container_status_restarts_total{container="webrtc-be", namespace="{{ .Release.Namespace }}"}[5m]) > 0
          # Instant alert - no 'for' duration, so it fires on the first rule
          # evaluation in which the expression returns a result.
          labels:
            severity: critical
          annotations:
            summary: "webrtc-be crashed"
            description: "The webrtc-be pod has crashed. Please check the logs."
            loki_link: >-
              {{ .Values.grafana.externalUrl }}/grafana/d/loki-logs-fixed-v10/loki-logs-fixed-v10?var-namespace={{ .Release.Namespace }}&var-container=webrtc-be&var-logs=loki&var-level=$__all

        # Fires when the deployment reports zero available replicas
        # continuously for one minute.
        - alert: WebrtcBeDown
          expr: >-
            kube_deployment_status_replicas_available{deployment="webrtc-be", namespace="{{ .Release.Namespace }}"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "webrtc-be is down"
            description: "The webrtc-be service has been unavailable for more than 1 minute."
            loki_link: >-
              {{ .Values.grafana.externalUrl }}/grafana/d/loki-logs-fixed-v10/loki-logs-fixed-v10?var-namespace={{ .Release.Namespace }}&var-container=webrtc-be&var-logs=loki&var-level=$__all