# Helm values for the monitoring chart (kube-prometheus-stack subchart):
# Prometheus, Alertmanager and Grafana served over HTTPS behind ingress-nginx,
# with basic auth on the Prometheus and Alertmanager ingresses.

# Explicitly enable RBAC resource creation.
rbac:
  create: true

# Chart-level alert thresholds (not part of the kube-prometheus-stack values).
# NOTE(review): presumably CPU utilisation percentages used by this chart's
# own rule templates for the MediaNodeHighCPU / NodeHighCPU alerts that the
# Alertmanager config routes -- confirm against the templates.
cpuThresholdMedia: 65
cpuThresholdOther: 80

# Values passed to the kube-prometheus-stack subchart.
#
# NOTE(review): the original listed `prometheus`, `grafana` and `alertmanager`
# twice under this key (once for nodeSelector, once for everything else).
# Duplicate mapping keys are invalid YAML; lenient parsers keep only the last
# occurrence, which silently drops the `workload: infra` node selectors. Each
# component is now defined exactly once with both halves merged. This assumes
# every key below belongs under `kube-prometheus-stack:` -- confirm.
kube-prometheus-stack:
  crds:
    enabled: false

  # Disable the default node-exporter ServiceMonitor (both value paths) so the
  # manual `node-exporter` scrape job below does not produce duplicate series.
  prometheus-node-exporter:
    serviceMonitor:
      enabled: false
  nodeExporter:
    serviceMonitor:
      enabled: false

  prometheus:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /prometheus
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    prometheusSpec:
      # Merged from the duplicate `prometheus:` key.
      nodeSelector:
        workload: infra
      retention: 60d
      retentionSize: 20GB
      # Served under a sub-path; routePrefix and externalUrl must agree with
      # the ingress path above.
      routePrefix: /prometheus
      externalUrl: https://monitoring.video.jamkazam.com/prometheus
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
      # Select every ServiceMonitor except the default node-exporter one, so
      # node metrics come only from the manual scrape job below.
      serviceMonitorSelector:
        matchExpressions:
          - key: app.kubernetes.io/name
            operator: NotIn
            values:
              - prometheus-node-exporter
      serviceMonitorNamespaceSelector:
        matchExpressions:
          - key: kubernetes.io/metadata.name
            operator: In
            values:
              - monitoring  # Its own namespace
              - webrtc-be  # The application's namespace
      # Discover PrometheusRules in all namespaces (avoids a dependency on
      # namespace labels).
      ruleNamespaceSelector: {}
      ruleSelector:
        matchExpressions: []  # Match all rules in the selected namespaces
      # Manual node-exporter scrape job, discovered per node.
      additionalScrapeConfigs:
        - job_name: 'node-exporter'
          kubernetes_sd_configs:
            - role: node
          relabel_configs:
            # 1. Copy all node labels (including `workload`) onto the target.
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            # 2. `role: node` discovery defaults to the kubelet port (10250);
            #    rewrite the address to the node-exporter port (9100).
            - source_labels: [__address__]
              regex: '(.*):10250'
              replacement: '${1}:9100'
              target_label: __address__
            # 3. Set standard labels for dashboard and alert compatibility.
            - source_labels: [__meta_kubernetes_node_name]
              target_label: instance
              action: replace
            - source_labels: [__meta_kubernetes_node_name]
              target_label: node
              action: replace
            - target_label: namespace
              replacement: monitoring
            - target_label: pod
              replacement: node-exporter-discovery

  alertmanager:
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      paths:
        - /alertmanager
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    alertmanagerSpec:
      # Merged from the duplicate `alertmanager:` key.
      nodeSelector:
        workload: infra
      routePrefix: /alertmanager
      externalUrl: https://monitoring.video.jamkazam.com/alertmanager
      storage:
        volumeClaimTemplate:
          spec:
            storageClassName: linode-block-storage-retain
            resources:
              requests:
                storage: 30Gi
      # Secret (same namespace as Alertmanager) holding notification
      # credentials. The operator mounts it at
      # /etc/alertmanager/secrets/alertmanager-notification-secrets/.
      # Expected keys: `smtp-auth-password`, `slack-api-url`.
      secrets:
        - alertmanager-notification-secrets

    config:
      global:
        resolve_timeout: 5m
        smtp_smarthost: 'email-smtp.us-east-1.amazonaws.com:587'
        smtp_from: 'support@jamkazam.com'
        # SECURITY: the SMTP username/password and the Slack webhook URL were
        # previously committed here in plaintext. Treat them as compromised:
        # rotate them and store the new values in the Secret mounted above.
        # Supply the rotated SMTP user name via a private values overlay or
        # `--set`; do not commit it.
        smtp_auth_username: 'REPLACE_WITH_SES_SMTP_USERNAME'
        # NOTE(review): the *_file options need a reasonably recent
        # Alertmanager (smtp_auth_password_file: 0.25+) -- confirm version.
        smtp_auth_password_file: /etc/alertmanager/secrets/alertmanager-notification-secrets/smtp-auth-password
        smtp_require_tls: true
        # Shared by every slack_configs entry below (none sets its own api_url).
        slack_api_url_file: /etc/alertmanager/secrets/alertmanager-notification-secrets/slack-api-url
      route:
        group_by: ['job']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 12h
        # Anything not matched by a route below goes to the 'null' receiver,
        # which has no notifier configured.
        receiver: 'null'
        routes:
          - match:
              alertname: MediaNodeHighCPU
            receiver: 'slack-notifications'
          - match:
              alertname: NodeMissingWorkloadLabel
            receiver: 'slack-notifications'
          - match:
              alertname: NodeHighCPU
            receiver: 'slack-notifications'
          - match:
              alertname: WebrtcBeCrashed
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeDown
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: WebrtcBeError
            receiver: 'email-and-slack-notifications'
          - match:
              alertname: PodOOMKilled
            receiver: 'slack-notifications-oom'
          - match:
              alertname: PodCrashLoopBackOff
            receiver: 'slack-notifications'
      receivers:
        - name: 'null'
        - name: 'email-alerts'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
        - name: 'slack-notifications'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              # Inner lines are indented deeper than the first so the folded
              # scalar keeps one line per field.
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                {{ end }}
        # Same as 'slack-notifications' but with a fixed title and no
        # resolved notifications.
        - name: 'slack-notifications-oom'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              send_resolved: false
              title: '[PRODUCTION] [OOM KILLED] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                {{ end }}
        # Loki links cover StartsAt +/- 60 s as epoch milliseconds. Alertmanager
        # templates have no arithmetic functions (`add` / `mul` are undefined
        # and make the notification fail to render), so the offset is applied
        # with time.Time.Add (argument in nanoseconds) and the seconds value is
        # turned into milliseconds by appending "000".
        - name: 'email-and-slack-notifications'
          email_configs:
            - to: 'alerts@jamkazam.com'
              send_resolved: true
              headers:
                Subject: '[PRODUCTION] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}'
              html: '{{ template "email.default.html" . }}<br/><hr/>View in <a href="{{ .ExternalURL }}">Alertmanager</a><br/>{{ range .Alerts }}{{ if .Annotations.loki_link }}<a href="{{ .Annotations.loki_link }}&from={{ (.StartsAt.Add -60000000000).Unix }}000&to={{ (.StartsAt.Add 60000000000).Unix }}000">View Logs in Loki</a>{{ end }}{{ end }}'
          slack_configs:
            - channel: '#video-cluster-prd-alerts'
              send_resolved: true
              title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
              text: >-
                {{ range .Alerts }}
                  *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
                  *Description:* {{ .Annotations.description }}
                  *Details:*
                  {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
                  {{ end }}
                  {{ if .Annotations.loki_link }}
                  *Logs:* <{{ .Annotations.loki_link }}&from={{ (.StartsAt.Add -60000000000).Unix }}000&to={{ (.StartsAt.Add 60000000000).Unix }}000|View in Loki>
                  {{ end }}
                {{ end }}
                *Source:* <{{ .ExternalURL }}|Alertmanager>

  grafana:
    # Merged from the duplicate `grafana:` key.
    nodeSelector:
      workload: infra
    persistence:
      enabled: true
      storageClassName: linode-block-storage-retain
      size: 30Gi
    ingress:
      enabled: true
      pathType: Prefix
      annotations:
        kubernetes.io/ingress.class: nginx
        # nginx.ingress.kubernetes.io/rewrite-target: /$2
        cert-manager.io/cluster-issuer: letsencrypt-nginx-production
        nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
        # Basic auth is left disabled on the Grafana ingress.
        # nginx.ingress.kubernetes.io/auth-type: basic
        # nginx.ingress.kubernetes.io/auth-secret: monitoring-basic-auth
        # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
      hosts:
        - monitoring.video.jamkazam.com
      path: /grafana
      tls:
        - secretName: monitoring
          hosts:
            - monitoring.video.jamkazam.com
    grafana.ini:
      server:
        domain: monitoring.video.jamkazam.com
        # Served under /grafana; root_url and serve_from_sub_path must agree
        # with the ingress path above.
        root_url: "%(protocol)s://%(domain)s/grafana/"
        enable_gzip: "true"
        serve_from_sub_path: true
    sidecar:
      dashboards:
        enabled: true
        label: grafana_dashboard
        searchNamespace: ALL
    additionalDataSources:
      - name: Loki
        type: loki
        uid: loki
        url: http://loki.loki.svc:3100
        access: proxy

  # Disable control plane metrics
  kubeEtcd:
    enabled: false

  kubeControllerManager:
    enabled: false

  kubeScheduler:
    enabled: false

  kubelet:
    serviceMonitor:
      trackTimestampsStaleness: false