Preparing for webrtc-be crash alerts

This commit is contained in:
Seth Call 2025-12-13 14:53:22 -06:00
parent 7c39106dfa
commit 5b502f93b4
6 changed files with 128 additions and 4 deletions

19
k8s/monitoring/README.md Normal file
View File

@ -0,0 +1,19 @@
# Monitoring and Alerting
## Slack Webhook Configuration
The Slack notifications use a specific Incoming Webhook URL structure:
`https://hooks.slack.com/services/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`
The three path components after `/services/` represent:
- **T0L5RA3E0**: Slack Workspace ID (e.g., JamKazam)
- **B01SM8RC346**: Bot/App Configuration ID (unique to the specific "Incoming Webhook" integration created in the Slack app management)
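- **XDDOrcPE7eAXJPMCvc5FxIva**: The Secret Token for authentication. Anyone who has the full URL can post to the channel, so treat the whole URL as a credential and regenerate it if it is exposed.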
### Updating the Webhook
If you need to change the channel or regenerate the URL:
1. Go to [Slack App Management](https://api.slack.com/apps).
2. Select the relevant App (e.g., "Monitoring" or "Incoming Webhooks").
3. Navigate to **Incoming Webhooks**.
4. Generate a new Webhook URL for the desired channel.
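5. Update the `api_url` of every `slack_configs` entry (both the `slack-notifications` and `email-and-slack-notifications` receivers) in `values-production.yaml` and `values-staging.yaml`.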

View File

@ -150,13 +150,54 @@ kube-prometheus-stack:
routes:
- match:
alertname: WebrtcBeCrashed
receiver: 'email-alerts'
receiver: 'email-and-slack-notifications'
- match:
alertname: WebrtcBeError
receiver: 'email-and-slack-notifications'
receivers:
- name: 'null'
- name: 'email-alerts'
email_configs:
- to: 'alerts@jamkazam.com'
send_resolved: true
- name: 'slack-notifications'
slack_configs:
- api_url: 'https://hooks.slack.com/services/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'
channel: '#monitoring-alerts'
send_resolved: true
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
- name: 'email-and-slack-notifications'
email_configs:
- to: 'alerts@jamkazam.com'
send_resolved: true
headers:
Subject: '[PRODUCTION] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}'
html: '{{ template "email.default.html" . }}<br/><hr/>View in <a href="{{ .ExternalURL }}">Alertmanager</a><br/>{{ range .Alerts }}{{ if .Annotations.loki_link }}<a href="{{ .Annotations.loki_link }}">View Logs in Loki</a>{{ end }}{{ end }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/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'
channel: '#monitoring-alerts'
send_resolved: true
title: '[PRODUCTION] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ if .Annotations.loki_link }}
*Logs:* <{{ .Annotations.loki_link }}|View in Loki>
{{ end }}
{{ end }}
*Source:* <{{ .ExternalURL }}|Alertmanager>
grafana:
persistence:
enabled: true

View File

@ -150,13 +150,54 @@ kube-prometheus-stack:
routes:
- match:
alertname: WebrtcBeCrashed
receiver: 'email-alerts'
receiver: 'email-and-slack-notifications'
- match:
alertname: WebrtcBeError
receiver: 'email-and-slack-notifications'
receivers:
- name: 'null'
- name: 'email-alerts'
email_configs:
- to: 'alerts@jamkazam.com'
send_resolved: true
- name: 'slack-notifications'
slack_configs:
- api_url: 'https://hooks.slack.com/services/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'
channel: '#monitoring-alerts'
send_resolved: true
title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
- name: 'email-and-slack-notifications'
email_configs:
- to: 'alerts@jamkazam.com'
send_resolved: true
headers:
Subject: '[STAGING] {{ .Status | toUpper }} - {{ range .Alerts }}{{ .Annotations.summary }} {{ end }}'
html: '{{ template "email.default.html" . }}<br/><hr/>View in <a href="{{ .ExternalURL }}">Alertmanager</a><br/>{{ range .Alerts }}{{ if .Annotations.loki_link }}<a href="{{ .Annotations.loki_link }}">View Logs in Loki</a>{{ end }}{{ end }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/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'
channel: '#monitoring-alerts'
send_resolved: true
title: '[STAGING] [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Event Notification'
text: >-
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ if .Annotations.loki_link }}
*Logs:* <{{ .Annotations.loki_link }}|View in Loki>
{{ end }}
{{ end }}
*Source:* <{{ .ExternalURL }}|Alertmanager>
grafana:
persistence:
enabled: true

View File

@ -27,3 +27,5 @@ spec:
annotations:
summary: "webrtc-be crashed"
description: "The webrtc-be pod has crashed. Please check the logs."
loki_link: >-
{{ .Values.grafana.externalUrl }}/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{container=\"webrtc-be\", namespace=\"webrtc-be\"}"}]

8
scripts/loki-port-forward.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
#
# loki-port-forward.sh
#
# Exposes the Loki service (svc/loki in the "loki" namespace, port 3100)
# on local port 3101 by running kubectl port-forward.
#
# Usage: ./scripts/loki-port-forward.sh
#
# Leave this running in its own terminal. While it is up,
# ./scripts/loki-query.sh can be used to inspect logs.

loki_namespace="loki"
loki_service="svc/loki"
local_port=3101
remote_port=3100

printf 'Port forwarding Loki to http://localhost:%s...\n' "${local_port}"
kubectl -n "${loki_namespace}" port-forward "${loki_service}" "${local_port}:${remote_port}"

13
scripts/loki-query.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Query Loki for recent logs of a specific pod regex
# Usage: ./scripts/loki-query.sh [pod_regex]
# Example: ./scripts/loki-query.sh "webrtc-be-.*"
#
# Sends a single query_range request (limit=1) to the Loki HTTP API at
# http://localhost:3101 and prints the label set ("stream") of the first
# result, which shows the labels (namespace, cluster, etc) attached to the
# matching logs.
#
# Requires curl and jq, and Loki reachable on localhost:3101
# (see ./scripts/loki-port-forward.sh).
#
# Exit status: 0 when a response was received and parsed, 1 otherwise.

# Fail early with a clear message instead of a "command not found" mid-pipeline.
for required_tool in curl jq; do
  if ! command -v "${required_tool}" >/dev/null 2>&1; then
    echo "Error: '${required_tool}' is required but was not found in PATH." >&2
    exit 1
  fi
done

POD_REGEX="${1:-webrtc-be-.*}"

echo "Querying Loki for pod regex: ${POD_REGEX}"
echo "Checking labels (namespace, cluster, etc)..."

# The response is captured first so that a curl failure is detected on its
# own. Piping curl straight into jq reports only jq's status, and jq exits 0
# on empty input, so an unreachable Loki would look like success.
# -S keeps curl's error message visible even though -s silences progress.
if ! response="$(curl -G -sS "http://localhost:3101/loki/api/v1/query_range" \
  --data-urlencode "query={pod=~\"${POD_REGEX}\"}" \
  --data-urlencode "limit=1")"; then
  echo "Error: could not reach Loki at http://localhost:3101." >&2
  echo "Is ./scripts/loki-port-forward.sh running?" >&2
  exit 1
fi

# A response jq cannot parse is echoed back so the server's message is not lost.
if ! jq '.data.result[0].stream' <<<"${response}"; then
  echo "Error: unexpected response from Loki:" >&2
  printf '%s\n' "${response}" >&2
  exit 1
fi