Probers 1st attempt

This commit is contained in:
Seth Call 2025-11-18 20:21:03 -06:00
parent 8c271b9b7b
commit 15139dec05
11 changed files with 324 additions and 0 deletions

View File

@ -0,0 +1,23 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: probers
spec:
  destination:
    # NOTE(review): both `name` and `server` are present; Argo CD expects exactly
    # one of the two to identify the cluster. The empty name is tolerated, but
    # consider dropping it — confirm against your Argo CD version.
    name: ''
    namespace: probers
    server: 'https://kubernetes.default.svc'
  source:
    helm:
      valueFiles:
        # Quoted so a templated value can never be re-typed by the YAML parser.
        - "values-{{ .Values.environment }}.yaml"
    path: k8s/probers
    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
    # Quoted: an unquoted branch like "1.20" would parse as a float.
    targetRevision: "{{ .Values.gitBranch }}"
  project: default
  syncPolicy:
    syncOptions:
      # Create the `probers` namespace on first sync.
      - CreateNamespace=true
    automated:
      prune: true
      selfHeal: true

6
k8s/probers/Chart.yaml Normal file
View File

@ -0,0 +1,6 @@
apiVersion: v2
name: probers
# Typo fixed: "propers" -> "probers".
description: A Helm chart for various probers; in particular webrtc_be probing
type: application
version: 0.1.0
appVersion: "1.0.0"

View File

@ -0,0 +1,32 @@
{{/*
Create a default fully qualified app name.
Truncated to 63 characters because some Kubernetes name fields are limited
to that (DNS label spec). Honors .Values.fullnameOverride when set; otherwise
joins the release name and chart name, skipping the chart name when the
release name already contains it.
*/}}
{{- define "probers.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Common labels applied to every resource in this chart.
"+" is replaced with "_" because "+" is not a valid label-value character.
*/}}
{{- define "probers.labels" -}}
helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Sanitize Kubernetes label keys for Prometheus (kube-state-metrics compatibility).
Replaces characters like '.', '/', and '-' with '_'.
Example: app.kubernetes.io/name -> app_kubernetes_io_name
*/}}
{{- define "probers.sanitizePrometheusLabel" -}}
{{- . | replace "." "_" | replace "/" "_" | replace "-" "_" -}}
{{- end -}}

View File

@ -0,0 +1,28 @@
{{- if .Values.e2eProber.enabled }}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "probers.fullname" . }}-e2e
  labels:
    {{- include "probers.labels" . | nindent 4 }}
spec:
  # Cron expression comes from values; quoted so "*/15 * * * *" is never
  # misread by the YAML parser.
  schedule: "{{ .Values.e2eProber.schedule }}"
  # Never start a new probe while the previous run is still going.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: e2e-tester
            image: "{{ .Values.e2eProber.image.repository }}:{{ .Values.e2eProber.image.tag }}"
            # Quoted: templated scalars should always be quoted so an empty or
            # boolean-looking expansion cannot change the YAML type.
            imagePullPolicy: "{{ .Values.e2eProber.image.pullPolicy }}"
            # Assuming the image's entrypoint executes the test suite (e.g., `npm test` or `pytest`)
            env:
            - name: BACKEND_URL
              value: {{ .Values.e2eProber.backendUrl | quote }}
            resources:
              {{- toYaml .Values.e2eProber.resources | nindent 14 }}
{{- end }}

View File

@ -0,0 +1,77 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "probers.fullname" . }}-alerts
  labels:
    {{- include "probers.labels" . | nindent 4 }}
    {{- toYaml .Values.monitoring.labels | nindent 4 }}
spec:
  groups:
  - name: jkvideo.alerts
    rules:
    {{- if .Values.jkvideo.cpuAlerts.enabled }}
    # Alert: High CPU Utilization
    - alert: JkvideoHighCPU
      # This query calculates the CPU usage/limit ratio per container and then uses vector matching
      # (* on(...) group_left()) to filter the results to only the pods matching the specific labels defined in values.yaml.
      expr: |
        (
          # 1. Calculate CPU usage rate per container (cAdvisor metrics)
          sum by (pod, namespace, container) (
            rate(container_cpu_usage_seconds_total{
              namespace="{{ .Values.jkvideo.namespace }}",
              container!="", image!=""
            }[5m])
          )
          /
          # 2. Get CPU limits per container (kube-state-metrics)
          sum by (pod, namespace, container) (
            kube_pod_container_resource_limits{
              namespace="{{ .Values.jkvideo.namespace }}",
              resource="cpu"
            }
          )
        )
        # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
        # NOTE(review): kube-state-metrics v2+ only exports pod labels listed in
        # --metric-labels-allowlist — verify the deployment exposes these labels.
        * on (pod, namespace) group_left()
        (
          kube_pod_labels{
            namespace="{{ .Values.jkvideo.namespace }}"
            {{- range $key, $value := .Values.jkvideo.podLabels }},
            label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
            {{- end }}
          }
        ) > {{ .Values.jkvideo.cpuAlerts.threshold }}
      # Quoted so duration strings like "10m" stay strings after templating.
      for: "{{ .Values.jkvideo.cpuAlerts.for }}"
      labels:
        severity: warning
      annotations:
        summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
        # BUGFIX: Sprig `mul` is integer math, so `0.7 | mul 100` rendered 0.
        # `mulf` multiplies as floats; `int` trims float artifacts (70.000...1 -> 70).
        description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ .Values.jkvideo.cpuAlerts.threshold | mulf 100 | int }}% of the limit."
    {{- end }}
    {{- if .Values.turnProber.enabled }}
    # Alert: TURN Prober Failure
    - alert: TurnProberJobFailed
      # Alert if the CronJob fails (metric from kube-state-metrics).
      # The regex matches the job name generated by the CronJob controller (which appends a suffix).
      # (PromQL regexes are fully anchored; the leading ^ is redundant but harmless.)
      expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
      for: 0m # Alert immediately
      labels:
        severity: critical
      annotations:
        summary: "TURN Server Probe Failed"
        description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
    {{- end }}
    {{- if .Values.e2eProber.enabled }}
    # Alert: E2E Prober Failure
    - alert: E2EProberJobFailed
      expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Mediasoup E2E Session Probe Failed"
        description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
    {{- end }}

View File

@ -0,0 +1,52 @@
{{- if .Values.turnProber.enabled }}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "probers.fullname" . }}-turn
  labels:
    {{- include "probers.labels" . | nindent 4 }}
spec:
  schedule: "{{ .Values.turnProber.schedule }}"
  # Never overlap probe runs.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          imagePullSecrets:
            # nindent (chomped) instead of a bare `indent` on its own line:
            # it anchors the list items deterministically under this key.
            {{- toYaml .Values.turnProber.imagePullSecrets | nindent 12 }}
          restartPolicy: OnFailure
          containers:
          - name: turnutils-client
            image: "{{ .Values.turnProber.image.repository }}:{{ .Values.turnProber.image.tag }}"
            imagePullPolicy: "{{ .Values.turnProber.image.pullPolicy }}"
            command:
            - /bin/sh
            - -c
            - |
              set -e
              echo "Probing TURN server {{ .Values.turnProber.target }} via custom prober"
              # NOTE(review): credentials are expanded straight into the command
              # line (visible in `ps` and pod specs); prefer a Kubernetes Secret
              # exposed as env vars. Single quotes guard against shell
              # metacharacters in the templated values.
              # Run the custom client
              turnutils_uclient_ars -c -n 2000 -v -H 1 -R 1 \
                -u '{{ .Values.turnProber.username }}' \
                -w '{{ .Values.turnProber.password }}' \
                -p {{ .Values.turnProber.port }} \
                '{{ .Values.turnProber.target }}'
              # ORIGINAL PROBE (coturn stock invocation)
              # Run the client
              # -y: client-to-client mode (self-test, verifies relay functionality)
              # -n 5: Send 5 messages
              # -v: Verbose (useful for debugging logs)
              # Exits 0 on success, non-zero on failure
              #turnutils_uclient -v $PROTOCOL_FLAG -n 5 -y \
              #  -u {{ .Values.turnProber.username }} \
              #  -w {{ .Values.turnProber.password }} \
              #  {{ .Values.turnProber.target }}
              echo "TURN probe successful."
{{- end }}

View File

@ -0,0 +1,20 @@
# Production specific values

environment: "production"

jkvideo:
  # In production, we should be more conservative with alerts
  cpuAlerts:
    threshold: 0.9
    for: "10m"

turnProber:
  target: "turn.video.jamkazam.com"
  # Reminder: In a real environment, these should be managed via Kubernetes secrets
  # username: "prod-user"
  # password: "use-a-secret"

e2eProber:
  enabled: true  # Enabling for production
  backendUrl: "https://webrtc-be.video.jamkazam.com"
  image:
    tag: "stable-v1.2.3"  # Example of a stable tag for production

View File

@ -0,0 +1,20 @@
# Staging specific values

environment: "staging"

jkvideo:
  # In staging, we can be more aggressive with alerts
  cpuAlerts:
    threshold: 0.75
    for: "2m"

turnProber:
  target: "turn.staging.video.jamkazam.com"
  # Reminder: In a real environment, these should be managed via Kubernetes secrets
  # username: "staging-user"
  # password: "use-a-secret"

e2eProber:
  enabled: true  # Enabling for staging
  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
  image:
    tag: "staging-latest"

52
k8s/probers/values.yaml Normal file
View File

@ -0,0 +1,52 @@
# Default environment
environment: "staging"

# Common configuration for monitoring integration
monitoring:
  # Labels to apply to PrometheusRules so Prometheus Operator can discover them.
  labels:
    release: prometheus-stack

# jkvideo Backend Details
jkvideo:
  namespace: "webrtc-be"
  podLabels:
    app.kubernetes.io/name: "webrtc-be"
    component: "worker"
  cpuAlerts:
    enabled: true
    threshold: 0.7
    for: "5m"

# TURN Server Probing Configuration
turnProber:
  enabled: true
  schedule: "*/5 * * * *"
  image:
    repository: gcr.io/tough-craft-276813/coturn
    tag: "latest"
    pullPolicy: Always
  imagePullSecrets:
    - name: gcr-json-key
  target: "turn.staging.video.jamkazam.com"
  port: 3478
  # NOTE(review): plaintext credentials committed to VCS — move them to a
  # Kubernetes Secret and reference it from the CronJob template instead.
  username: "smoketest"
  password: "foolishcharmer"
  protocol: "udp"

# E2E Session Simulation Configuration
e2eProber:
  enabled: false
  schedule: "*/15 * * * *"
  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
  image:
    # NOTE(review): placeholder registry — replace before enabling this prober.
    repository: "your-registry/mediasoup-e2e-tester"
    tag: "latest"
    pullPolicy: IfNotPresent
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

4
scripts/helm-update-probers Executable file
View File

@ -0,0 +1,4 @@
# Upgrade/install the probers chart with production values.
# Resolves the chart directory relative to this script (matching the staging
# sibling script) so it can be run from anywhere — the old comment claiming
# "run in k8s/monitoring folder" was stale and the chart actually lives in
# k8s/probers.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null
helm upgrade --install probers . --values values-production.yaml
popd > /dev/null
# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml

View File

@ -0,0 +1,10 @@
# Upgrade/install the probers chart with staging values.
# Resolves the chart directory relative to this script so it can be run from
# anywhere (the old "run in k8s/monitoring folder" comment was stale — the
# script already cds into k8s/probers itself).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote the expansion so paths containing spaces don't word-split.
pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null
helm upgrade --install probers . --values values-staging.yaml
# Suppress popd's directory-stack echo, consistent with pushd above.
popd > /dev/null
# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml