video-iac/k8s/monitoring/values-production.yaml

298 lines
10 KiB
YAML
Raw Normal View History

2021-11-18 07:15:38 +00:00
# Helm chart values for Prometheus Operator with HTTPS and basic auth.

# Explicitly enable RBAC resource creation.
rbac:
  create: true

# CPU thresholds read from these values.
# NOTE(review): presumably percentages used by this chart's own alert-rule
# templates (the Alertmanager routes mention MediaNodeHighCPU / NodeHighCPU);
# the consuming templates are not visible here - confirm.
cpuThresholdMedia: 65
cpuThresholdOther: 80
2025-10-05 22:43:16 +00:00
2021-11-18 07:15:38 +00:00
# Values passed to the kube-prometheus-stack subchart.
#
# NOTE(review): `prometheus`, `grafana` and `alertmanager` each used to appear
# twice under this key (a nodeSelector-only stanza followed by the full
# stanza). Duplicate mapping keys are invalid YAML and Helm's parser keeps only
# the last occurrence, so the `workload: infra` nodeSelectors were silently
# dropped. Each pair is merged into a single mapping below.
kube-prometheus-stack:
  crds:
    enabled: false

  # Disable the default ServiceMonitor configuration paths to prevent
  # duplicates; node-exporter is scraped through additionalScrapeConfigs below.
  prometheus-node-exporter:
    serviceMonitor:
      enabled: false
  nodeExporter:
    serviceMonitor:
      enabled: false

  prometheus:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /prometheus
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    prometheusSpec:
      # Merged from the former duplicate `prometheus:` stanza.
      nodeSelector:
        workload: infra
      retention: 60d
      retentionSize: 20GB
      # routePrefix and externalUrl must agree with the ingress path above.
      routePrefix: /prometheus
      externalUrl: https://monitoring.video.jamkazam.com/prometheus
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
      # CRUCIAL: ensure the default ServiceMonitor is ignored.
      # This prevents duplicate metrics by telling Prometheus to ignore the
      # default node-exporter ServiceMonitor.
      serviceMonitorSelector:
        matchExpressions:
          # Exclude the default node-exporter ServiceMonitor by its label.
          - key: app.kubernetes.io/name
            operator: NotIn
            values:
              - prometheus-node-exporter
      serviceMonitorNamespaceSelector:
        matchExpressions:
          - key: kubernetes.io/metadata.name
            operator: In
            values:
              - monitoring  # Its own namespace
              - webrtc-be  # The application's namespace
      # Enable discovery of PrometheusRules: match all namespaces (avoids a
      # dependency on namespace labels) and all rules within them.
      ruleNamespaceSelector: {}
      ruleSelector:
        matchExpressions: []
      # Manual scrape configuration for node-exporter.
      additionalScrapeConfigs:
        - job_name: 'node-exporter'
          kubernetes_sd_configs:
            - role: node
          relabel_configs:
            # 1. Pull all node labels (including workload).
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            # 2. Target the node-exporter port (9100) on the node's IP.
            #    role: node discovery defaults to the Kubelet port (10250).
            - source_labels: [__address__]
              regex: '(.*):10250'
              replacement: '${1}:9100'
              target_label: __address__
            # 3. Set standard labels for dashboard and alert compatibility.
            - source_labels: [__meta_kubernetes_node_name]
              target_label: instance
              action: replace
            - source_labels: [__meta_kubernetes_node_name]
              target_label: node
              action: replace
            - target_label: namespace
              replacement: monitoring
            - target_label: pod
              replacement: node-exporter-discovery

  alertmanager:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /alertmanager
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    alertmanagerSpec:
      # Merged from the former duplicate `alertmanager:` stanza.
      nodeSelector:
        workload: infra
      routePrefix: /alertmanager
      externalUrl: https://monitoring.video.jamkazam.com/alertmanager
      # Secrets listed here are mounted by the operator at
      # /etc/alertmanager/secrets/<secret-name>/. This Secret must exist in the
      # Alertmanager namespace with the keys `smtp_auth_password` and
      # `slack_api_url`; it replaces credentials that were previously committed
      # in this file.
      secrets:
        - alertmanager-notification-secrets
      storage:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
    config:
      global:
        resolve_timeout: 5m
        smtp_smarthost: 'email-smtp.us-east-1.amazonaws.com:587'
        smtp_from: 'support@jamkazam.com'
        # SECURITY: the SES SMTP password and the Slack webhook URL were stored
        # here in plain text. Treat them as compromised: rotate both, then put
        # the new values in the mounted Secret. The username is not secret on
        # its own but changes when the key is rotated - update it then.
        # NOTE(review): the *_file options need Alertmanager >= 0.25 - confirm
        # the deployed version.
        smtp_auth_username: 'AKIA2SXEHOQFM326T4WJ'
        smtp_auth_password_file: /etc/alertmanager/secrets/alertmanager-notification-secrets/smtp_auth_password
        smtp_require_tls: true
        # One webhook shared by every Slack receiver below (they all used the
        # same URL), so it is declared once instead of per receiver.
        slack_api_url_file: /etc/alertmanager/secrets/alertmanager-notification-secrets/slack_api_url
      route:
        group_by: ['job']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 12h
        # Anything not matched by a child route is discarded.
        receiver: 'null'
        routes:
          - match:
              alertname: MediaNodeHighCPU
            receiver: 'slack-notifications'
          - match:
              alertname: NodeMissingWorkloadLabel
            receiver: 'slack-notifications'
          - match:
              alertname: NodeHighCPU
            receiver: 'slack-notifications'
          - match:
              alertname: WebrtcBeCrashed
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeDown
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeError
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: PodOOMKilled
            receiver: 'slack-notifications-oom'
          - match:
              alertname: PodCrashLoopBackOff
            receiver: 'slack-notifications'
      # NOTE(review): the indentation inside the folded `text` scalars below
      # was reconstructed (inner lines more indented so their line breaks
      # survive folding) - compare with the rendered Slack messages.
      receivers:
        - name: 'null'
        - name: 'email-alerts'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
        - name: 'slack-notifications'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                {{ end }}
        - name: 'slack-notifications-oom'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              # No resolved notifications are sent for OOM kills.
              send_resolved: false
              title: '[PRODUCTION] [OOM KILLED] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                {{ end }}
        # The Loki links cover StartsAt +/- 60 s as epoch milliseconds.
        # Alertmanager templates have no `add`/`mul` functions (those are
        # Sprig/Helm only), so the offset uses time.Time.Add with a nanosecond
        # literal and the seconds-to-milliseconds step appends a literal "000".
        - name: 'email-and-slack-notifications'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
              headers:
                Subject: '[PRODUCTION] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}'
              html: '{{ template "email.default.html" . }}<br/><hr/>View in <a href="{{ .ExternalURL }}">Alertmanager</a><br/>{{ range .Alerts }}{{ if .Annotations.loki_link }}<a href="{{ .Annotations.loki_link }}&from={{ (.StartsAt.Add -60000000000).Unix }}000&to={{ (.StartsAt.Add 60000000000).Unix }}000">View Logs in Loki</a>{{ end }}{{ end }}'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                  {{ if .Annotations.loki_link }}
                  *Logs:* <{{ .Annotations.loki_link }}&from={{ (.StartsAt.Add -60000000000).Unix }}000&to={{ (.StartsAt.Add 60000000000).Unix }}000|View in Loki>
                  {{ end }}
                {{ end }}
                *Source:* <{{ .ExternalURL }}|Alertmanager>

  grafana:
    # Merged from the former duplicate `grafana:` stanza.
    nodeSelector:
      workload: infra
    persistence:
      enabled: true
      storageClassName: linode-block-storage-retain
      size: 30Gi
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        # Ingress-level basic auth is left disabled for Grafana.
        # nginx.ingress.kubernetes.io/auth-type: basic
        # nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      path: /grafana
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    grafana.ini:
      server:
        domain: monitoring.video.jamkazam.com
        # Served under /grafana, matching the ingress path above.
        root_url: "%(protocol)s://%(domain)s/grafana/"
        enable_gzip: "true"
        serve_from_sub_path: true
    sidecar:
      dashboards:
        enabled: true
        label: grafana_dashboard
        searchNamespace: ALL
    additionalDataSources:
      - name: Loki
        type: loki
        uid: loki
        url: http://loki.loki.svc:3100
        access: proxy

  # Disable control plane metrics.
  kubeEtcd:
    enabled: false
  kubeControllerManager:
    enabled: false
  kubeScheduler:
    enabled: false
  kubelet:
    serviceMonitor:
      trackTimestampsStaleness: false