Probers 1st attempt

parent 8c271b9b7b
commit 15139dec05

@@ -0,0 +1,23 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: probers
spec:
  destination:
    name: ''
    namespace: probers
    server: 'https://kubernetes.default.svc'
  source:
    helm:
      valueFiles:
        - values-{{ .Values.environment }}.yaml
    path: k8s/probers
    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
    targetRevision: {{ .Values.gitBranch }}
  project: default
  syncPolicy:
    syncOptions:
      - CreateNamespace=true
    automated:
      prune: true
      selfHeal: true

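Note that this Application manifest is itself Helm-templated (`{{ .Values.environment }}`, `{{ .Values.gitBranch }}`), so it is presumably rendered by a parent app-of-apps chart rather than applied directly. A minimal sketch of the values such a parent chart would need to supply (the key names come from the template above; the surrounding file and its location are an assumption):

# Hypothetical parent-chart values (not part of this commit)
environment: staging   # selects values-staging.yaml inside k8s/probers
gitBranch: main        # branch of video-iac that Argo CD should track
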
@@ -0,0 +1,6 @@
apiVersion: v2
name: probers
description: A Helm chart for various probers; in particular webrtc_be probing
type: application
version: 0.1.0
appVersion: "1.0.0"

@@ -0,0 +1,32 @@
{{/*
Create a default fully qualified app name.
*/}}
{{- define "probers.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "probers.labels" -}}
helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Sanitize Kubernetes label keys for Prometheus (kube-state-metrics compatibility).
Replaces characters like '.', '/', and '-' with '_'.
Example: app.kubernetes.io/name -> app_kubernetes_io_name
*/}}
{{- define "probers.sanitizePrometheusLabel" -}}
{{- . | replace "." "_" | replace "/" "_" | replace "-" "_" -}}
{{- end -}}

@@ -0,0 +1,28 @@
{{- if .Values.e2eProber.enabled }}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "probers.fullname" . }}-e2e
  labels:
    {{- include "probers.labels" . | nindent 4 }}
spec:
  schedule: "{{ .Values.e2eProber.schedule }}"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: e2e-tester
            image: "{{ .Values.e2eProber.image.repository }}:{{ .Values.e2eProber.image.tag }}"
            imagePullPolicy: {{ .Values.e2eProber.image.pullPolicy }}
            # Assuming the image's entrypoint executes the test suite (e.g., `npm test` or `pytest`)
            env:
            - name: BACKEND_URL
              value: {{ .Values.e2eProber.backendUrl | quote }}
            resources:
              {{- toYaml .Values.e2eProber.resources | nindent 14 }}
{{- end }}

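Because the suite only runs on the CronJob schedule, a one-off run can be triggered from the existing CronJob for a quick smoke test. A sketch, assuming the default release name `probers` and the `probers` namespace created by the Application above:

kubectl create job --from=cronjob/probers-e2e probers-e2e-manual -n probers
kubectl logs -n probers job/probers-e2e-manual -f
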
@@ -0,0 +1,77 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "probers.fullname" . }}-alerts
  labels:
    {{- include "probers.labels" . | nindent 4 }}
    {{- toYaml .Values.monitoring.labels | nindent 4 }}
spec:
  groups:
  - name: jkvideo.alerts
    rules:
    {{- if .Values.jkvideo.cpuAlerts.enabled }}
    # Alert: High CPU Utilization
    - alert: JkvideoHighCPU
      # This query calculates the CPU usage/limit ratio per container and then uses vector matching
      # (* on(...) group_left()) to filter the results to only the pods matching the specific labels defined in values.yaml.
      expr: |
        (
          # 1. Calculate CPU usage rate per container (cAdvisor metrics)
          sum by (pod, namespace, container) (
            rate(container_cpu_usage_seconds_total{
              namespace="{{ .Values.jkvideo.namespace }}",
              container!="", image!=""
            }[5m])
          )
          /
          # 2. Get CPU limits per container (kube-state-metrics)
          sum by (pod, namespace, container) (
            kube_pod_container_resource_limits{
              namespace="{{ .Values.jkvideo.namespace }}",
              resource="cpu"
            }
          )
        )
        # 3. Filter by Kubernetes labels using vector matching against kube_pod_labels
        * on (pod, namespace) group_left()
        (
          kube_pod_labels{
            namespace="{{ .Values.jkvideo.namespace }}"
            {{- range $key, $value := .Values.jkvideo.podLabels }},
            label_{{ include "probers.sanitizePrometheusLabel" $key }}="{{ $value }}"
            {{- end }}
          }
        ) > {{ .Values.jkvideo.cpuAlerts.threshold }}
      for: {{ .Values.jkvideo.cpuAlerts.for }}
      labels:
        severity: warning
      annotations:
        summary: "Jkvideo Pod {{ "{{" }} $labels.pod {{ "}}" }} High CPU Utilization"
        description: "CPU utilization for container {{ "{{" }} $labels.container {{ "}}" }} in pod {{ "{{" }} $labels.pod {{ "}}" }} has exceeded {{ .Values.jkvideo.cpuAlerts.threshold | mulf 100 }}% of the limit."
    {{- end }}

    {{- if .Values.turnProber.enabled }}
    # Alert: TURN Prober Failure
    - alert: TurnProberJobFailed
      # Alert if the CronJob fails (metric from kube-state-metrics).
      # The regex matches the job name generated by the CronJob controller (which appends a suffix).
      expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-turn-.*"} > 0
      for: 0m # Alert immediately
      labels:
        severity: critical
      annotations:
        summary: "TURN Server Probe Failed"
        description: "The CronJob testing the TURN server failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
    {{- end }}

    {{- if .Values.e2eProber.enabled }}
    # Alert: E2E Prober Failure
    - alert: E2EProberJobFailed
      expr: kube_job_status_failed{job_name=~"^{{ include "probers.fullname" . }}-e2e-.*"} > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Mediasoup E2E Session Probe Failed"
        description: "The CronJob simulating a 2-party session failed. Check the logs of the failed job ({{ "{{" }} $labels.job_name {{ "}}" }})."
    {{- end }}

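For reference, with the default jkvideo.podLabels from values.yaml, the label-filter part of the CPU expression should render roughly as the PromQL selector below. This is a sketch of the generated output, and it assumes kube-state-metrics actually exposes these pod labels; on kube-state-metrics v2+ that typically requires the --metric-labels-allowlist flag.

kube_pod_labels{
  namespace="webrtc-be",
  label_app_kubernetes_io_name="webrtc-be",
  label_component="worker"
}
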
@@ -0,0 +1,52 @@
{{- if .Values.turnProber.enabled }}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "probers.fullname" . }}-turn
  labels:
    {{- include "probers.labels" . | nindent 4 }}
spec:
  schedule: "{{ .Values.turnProber.schedule }}"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          imagePullSecrets:
{{ toYaml .Values.turnProber.imagePullSecrets | indent 12 }}
          restartPolicy: OnFailure
          containers:
          - name: turnutils-client
            image: "{{ .Values.turnProber.image.repository }}:{{ .Values.turnProber.image.tag }}"
            imagePullPolicy: {{ .Values.turnProber.image.pullPolicy }}
            command:
            - /bin/sh
            - -c
            - |
              set -e
              echo "Probing TURN server {{ .Values.turnProber.target }} via custom prober"

              # Run the custom client
              turnutils_uclient_ars -c -n 2000 -v -H 1 -R 1 \
                -u {{ .Values.turnProber.username }} \
                -w {{ .Values.turnProber.password }} \
                -p {{ .Values.turnProber.port }} \
                {{ .Values.turnProber.target }}


              # ORIGINAL PROBE (coturn stock invocation)
              # Run the client
              # -y: client-to-client mode (self-test, verifies relay functionality)
              # -n 5: Send 5 messages
              # -v: Verbose (useful for debugging logs)
              # Exits 0 on success, non-zero on failure
              #turnutils_uclient -v $PROTOCOL_FLAG -n 5 -y \
              #  -u {{ .Values.turnProber.username }} \
              #  -w {{ .Values.turnProber.password }} \
              #  {{ .Values.turnProber.target }}


              echo "TURN probe successful."
{{- end }}

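As the values files remind below, the TURN credentials are currently interpolated into the probe command as plain chart values. One possible direction (not part of this commit) is to read them from a Kubernetes Secret via environment variables and reference "$TURN_USERNAME" / "$TURN_PASSWORD" in the command instead. A sketch of the container env block, assuming a hypothetical Secret named turn-prober-credentials with keys username and password:

            env:
            - name: TURN_USERNAME
              valueFrom:
                secretKeyRef:
                  name: turn-prober-credentials   # assumed Secret, not created by this chart
                  key: username
            - name: TURN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: turn-prober-credentials
                  key: password
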
@@ -0,0 +1,20 @@
# Production specific values
environment: "production"

jkvideo:
  # In production, we should be more conservative with alerts
  cpuAlerts:
    threshold: 0.9
    for: "10m"

turnProber:
  target: "turn.video.jamkazam.com"
  # Reminder: In a real environment, these should be managed via Kubernetes secrets
  # username: "prod-user"
  # password: "use-a-secret"

e2eProber:
  enabled: true # Enabling for production
  backendUrl: "https://webrtc-be.video.jamkazam.com"
  image:
    tag: "stable-v1.2.3" # Example of a stable tag for production

@@ -0,0 +1,20 @@
# Staging specific values
environment: "staging"

jkvideo:
  # In staging, we can be more aggressive with alerts
  cpuAlerts:
    threshold: 0.75
    for: "2m"

turnProber:
  target: "turn.staging.video.jamkazam.com"
  # Reminder: In a real environment, these should be managed via Kubernetes secrets
  # username: "staging-user"
  # password: "use-a-secret"

e2eProber:
  enabled: true # Enabling for staging
  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
  image:
    tag: "staging-latest"

@@ -0,0 +1,52 @@
# Default environment
environment: "staging"

# Common configuration for monitoring integration
monitoring:
  # Labels to apply to PrometheusRules so Prometheus Operator can discover them.
  labels:
    release: prometheus-stack

# jkvideo Backend Details
jkvideo:
  namespace: "webrtc-be"
  podLabels:
    app.kubernetes.io/name: "webrtc-be"
    component: "worker"
  cpuAlerts:
    enabled: true
    threshold: 0.7
    for: "5m"

# TURN Server Probing Configuration
turnProber:
  enabled: true
  schedule: "*/5 * * * *"
  image:
    repository: gcr.io/tough-craft-276813/coturn
    tag: "latest"
    pullPolicy: Always
  imagePullSecrets:
    - name: gcr-json-key
  target: "turn.staging.video.jamkazam.com"
  port: 3478
  username: "smoketest"
  password: "foolishcharmer"
  protocol: "udp"

# E2E Session Simulation Configuration
e2eProber:
  enabled: false
  schedule: "*/15 * * * *"
  backendUrl: "https://webrtc-be.staging.video.jamkazam.com"
  image:
    repository: "your-registry/mediasoup-e2e-tester"
    tag: "latest"
    pullPolicy: IfNotPresent
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

@@ -0,0 +1,4 @@
# run in the k8s/probers folder
helm upgrade --install probers . --values values-production.yaml

# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml

@@ -0,0 +1,10 @@
# deploys the probers chart from k8s/probers; can be run from any directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

pushd "$SCRIPT_DIR/../k8s/probers" > /dev/null

helm upgrade --install probers . --values values-staging.yaml

popd

# helm upgrade webrtc-be . --namespace webrtc-be -f values-staging.yaml

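Before deploying, the chart can be rendered locally to catch templating mistakes; a sketch using standard Helm commands, assuming it is run from the k8s/probers directory:

helm lint . -f values-staging.yaml
helm template probers . -f values-staging.yaml > /tmp/probers-rendered.yaml
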