# video-iac/k8s/monitoring/values-production.yaml
# Helm chart values for Prometheus Operator with HTTPS and basic auth
# Explicitly enable RBAC resource creation
rbac:
  create: true

kube-prometheus-stack:
  # NOTE(review): CRD installation is disabled — assumes the Prometheus Operator
  # CRDs are applied out-of-band (e.g. `kubectl apply --server-side`); confirm.
  crds:
    enabled: false

  # Disable the default ServiceMonitor configuration paths to prevent duplicates
  prometheus-node-exporter:
    serviceMonitor:
      enabled: false
  nodeExporter:
    serviceMonitor:
      enabled: false

  prometheus:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /prometheus
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    prometheusSpec:
      retention: 60d
      retentionSize: 20GB
      routePrefix: /prometheus
      externalUrl: https://monitoring.video.jamkazam.com/prometheus
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
      # !!! CRUCIAL: Ensure the default ServiceMonitor is ignored !!!
      # This prevents duplicate metrics by telling Prometheus to ignore the default SM.
      serviceMonitorSelector:
        matchExpressions:
          # Exclude the default node-exporter ServiceMonitor
          - key: app.kubernetes.io/name
            operator: NotIn
            values:
              # Use the label identified above
              - prometheus-node-exporter
      serviceMonitorNamespaceSelector:
        matchExpressions:
          - key: kubernetes.io/metadata.name
            operator: In
            values:
              - monitoring  # Its own namespace
              - webrtc-be   # Your app's namespace
      # Enable discovery of PrometheusRules in these namespaces
      ruleNamespaceSelector: {}  # Match all namespaces (avoids dependency on namespace labels)
      ruleSelector:
        matchExpressions: []  # Match all rules in selected namespaces
      # Add the manual scrape configuration
      additionalScrapeConfigs:
        - job_name: 'node-exporter'
          kubernetes_sd_configs:
            - role: endpoints
          relabel_configs:
            # 1. Filter: precisely target the node-exporter service in the monitoring namespace.
            - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
              separator: '/'
              # NOTE(review): assumes the service name is
              # 'monitoring-prometheus-node-exporter' — verify against the release name.
              regex: 'monitoring/monitoring-prometheus-node-exporter'
              action: keep
            # 2. Filter: ensure we are targeting the standard port (usually 9100)
            - source_labels: [__address__]
              regex: '.*:9100$'
              action: keep
            # 3. Set the instance label to the node name
            - source_labels: [__meta_kubernetes_endpoint_node_name]
              target_label: instance
              action: replace
            - source_labels: [__address__]
              target_label: ip_address
              action: replace
            # 4. Replicate standard labels for dashboard compatibility
            - action: labelmap
              regex: __meta_kubernetes_pod_label_(.+)
            # Ensure standard labels are present for dashboard compatibility
            - source_labels: [__meta_kubernetes_namespace]
              target_label: namespace
            - source_labels: [__meta_kubernetes_pod_name]
              target_label: pod
            - source_labels: [__meta_kubernetes_endpoint_node_name]
              target_label: node

  alertmanager:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /alertmanager
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    alertmanagerSpec:
      routePrefix: /alertmanager
      externalUrl: https://monitoring.video.jamkazam.com/alertmanager
      storage:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
    config:
      global:
        resolve_timeout: 5m
        smtp_smarthost: 'email-smtp.us-east-1.amazonaws.com:587'
        smtp_from: 'support@jamkazam.com'
        # SECURITY(review): live AWS SES SMTP credentials are committed in plaintext.
        # Rotate this key pair and move the values to a Kubernetes Secret
        # (e.g. alertmanagerSpec secrets / an external secret store) instead.
        smtp_auth_username: 'AKIA2SXEHOQFM326T4WJ'
        smtp_auth_password: 'BM6zKJUOWSc4XF+1dXZZlqAkbybGX+KbY+YciI7PIcsn'
        smtp_require_tls: true
      route:
        group_by: ['job']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 12h
        # Default receiver: drop anything not matched below.
        receiver: 'null'
        routes:
          - match:
              alertname: NodeHighCPU
            receiver: 'slack-notifications'
          - match:
              alertname: WebrtcBeCrashed
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeDown
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeError
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: PodOOMKilled
            receiver: 'slack-notifications-oom'
          - match:
              alertname: PodCrashLoopBackOff
            receiver: 'slack-notifications'
      receivers:
        - name: 'null'
        - name: 'email-alerts'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
        # SECURITY(review): Slack webhook URLs are secrets and are committed in
        # plaintext below — rotate the webhook and inject it from a Secret.
        - name: 'slack-notifications'
          slack_configs:
            - api_url: 'https://hooks.slack.com/services/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'
              channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                *Description:* {{ .Annotations.description }}
                *Details:*
                {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                {{ end }}
                {{ end }}
        - name: 'slack-notifications-oom'
          slack_configs:
            - api_url: 'https://hooks.slack.com/services/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'
              channel: '#video-cluster-prd-alerts'
              send_resolved: false
              title: '[PRODUCTION] [OOM KILLED] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                *Description:* {{ .Annotations.description }}
                *Details:*
                {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                {{ end }}
                {{ end }}
        - name: 'email-and-slack-notifications'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
              headers:
                Subject: '[PRODUCTION] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}'
              html: '{{ template "email.default.html" . }}<br/><hr/>View in <a href="{{ .ExternalURL }}">Alertmanager</a><br/>{{ range .Alerts }}{{ if .Annotations.loki_link }}<a href="{{ .Annotations.loki_link }}">View Logs in Loki</a>{{ end }}{{ end }}'
          slack_configs:
            - api_url: 'https://hooks.slack.com/services/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'
              channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                *Description:* {{ .Annotations.description }}
                *Details:*
                {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                {{ end }}
                {{ if .Annotations.loki_link }}
                *Logs:* <{{ .Annotations.loki_link }}|View in Loki>
                {{ end }}
                {{ end }}
                *Source:* <{{ .ExternalURL }}|Alertmanager>

  grafana:
    persistence:
      enabled: true
      storageClassName: linode-block-storage-retain
      size: 30Gi
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        # nginx.ingress.kubernetes.io/auth-type: basic
        # nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      # The grafana subchart uses a singular `path` (unlike prometheus/alertmanager).
      path: /grafana
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    grafana.ini:
      server:
        domain: monitoring.video.jamkazam.com
        root_url: "%(protocol)s://%(domain)s/grafana/"
        enable_gzip: "true"
        serve_from_sub_path: true
    sidecar:
      dashboards:
        enabled: true
        label: grafana_dashboard
        searchNamespace: ALL
    additionalDataSources:
      - name: Loki
        type: loki
        uid: loki
        url: http://loki.loki.svc:3100
        access: proxy

  # Disable control plane metrics (not scrapeable on managed Kubernetes)
  kubeEtcd:
    enabled: false
  kubeControllerManager:
    enabled: false
  kubeScheduler:
    enabled: false
  kubelet:
    serviceMonitor:
      trackTimestampsStaleness: false