diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000..655d600 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,24 @@ +# Kubernetes Configuration + +This directory contains Kubernetes manifests and configuration for the video infrastructure. + +## Managing CRDs + +The file `all-crds.yaml` contains all Custom Resource Definitions (CRDs) required by the monitoring stack (Prometheus Operator). + +### When to update CRDs + +You should regenerate `all-crds.yaml` by running `scripts/update-crds.sh` when: + +1. **Upgrading the `kube-prometheus-stack` Helm chart**: If you bump the chart version in `k8s/monitoring/Chart.yaml` and update the dependencies, you must also update the CRDs to match the new version. +2. **Missing CRD fields**: If you encounter errors like `field not declared in schema` during ArgoCD syncs, it likely means the installed CRDs are outdated. + +### How to update + +Run the update script from the repository root: + +```bash +./scripts/update-crds.sh +``` + +This script extracts the CRDs from the local `kube-prometheus-stack` chart package and concatenates them into `k8s/all-crds.yaml`. 
diff --git a/k8s/applications/templates/loki.yaml b/k8s/applications/templates/loki.yaml
new file mode 100644
index 0000000..d4393a7
--- /dev/null
+++ b/k8s/applications/templates/loki.yaml
@@ -0,0 +1,33 @@
+# Argo CD Application: deploys the Helm chart at k8s/loki into the `loki`
+# namespace through the in-cluster API server address.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: loki
+spec:
+  destination:
+    namespace: loki
+    server: 'https://kubernetes.default.svc'
+  source:
+    helm:
+      valueFiles:
+        - values.yaml
+    path: k8s/loki
+    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
+    # Quoted so that a branch name that looks like a number or boolean (or is
+    # empty) still renders as a YAML string.
+    targetRevision: {{ .Values.gitBranch | quote }}
+  project: default
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
+    automated:
+      prune: true
+    # Retry a failed sync up to 5 times; the delay starts at 5s and doubles,
+    # capped at 3m.
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
diff --git a/k8s/applications/templates/promtail.yaml b/k8s/applications/templates/promtail.yaml
new file mode 100644
index 0000000..bb5b455
--- /dev/null
+++ b/k8s/applications/templates/promtail.yaml
@@ -0,0 +1,33 @@
+# Argo CD Application: deploys the Helm chart at k8s/promtail into the `loki`
+# namespace through the in-cluster API server address.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: promtail
+spec:
+  destination:
+    namespace: loki
+    server: 'https://kubernetes.default.svc'
+  source:
+    helm:
+      valueFiles:
+        - values.yaml
+    path: k8s/promtail
+    repoURL: 'git@bitbucket.org:jamkazam/video-iac.git'
+    # Quoted so that a branch name that looks like a number or boolean (or is
+    # empty) still renders as a YAML string.
+    targetRevision: {{ .Values.gitBranch | quote }}
+  project: default
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
+    automated:
+      prune: true
+    # Retry a failed sync up to 5 times; the delay starts at 5s and doubles,
+    # capped at 3m.
+    retry:
+      limit: 5
+      backoff:
+        duration: 5s
+        factor: 2
+        maxDuration: 3m
diff --git a/k8s/loki/Chart.yaml b/k8s/loki/Chart.yaml
new file mode 100644
index 0000000..c81b695
--- /dev/null
+++ b/k8s/loki/Chart.yaml
@@ -0,0 +1,7 @@
+# The packaged chart charts/loki-5.42.0.tgz is committed next to this file.
+apiVersion: v2
+name: loki
+description: A Helm chart for Loki
+type: application
+version: 0.1.0
+appVersion: "1.0"
diff --git a/k8s/loki/charts/loki-5.42.0.tgz b/k8s/loki/charts/loki-5.42.0.tgz
new file mode 100644
index 0000000..275a0d0
Binary files /dev/null and b/k8s/loki/charts/loki-5.42.0.tgz differ
diff --git a/k8s/loki/values.yaml b/k8s/loki/values.yaml
new file mode 100644
index 0000000..5484313
--- /dev/null
+++ b/k8s/loki/values.yaml
@@ -0,0 +1,59 @@
+# Intended setup: one single-binary Loki replica that keeps its index and
+# chunks under /data/loki, requests a 20Gi persistent volume, and has the
+# read/write/backend targets and the gateway switched off.
+#
+# NOTE(review): Helm passes only the `loki:` mapping to a subchart named
+# `loki`; the top-level singleBinary/read/write/backend/gateway keys may not
+# reach it - confirm with `helm template`.
+loki:
+  # Loki configuration file, supplied as one literal string.
+  config: |
+    auth_enabled: false
+    server:
+      http_listen_port: 3100
+    ingester:
+      lifecycler:
+        address: 127.0.0.1
+        ring:
+          kvstore:
+            store: inmemory
+          replication_factor: 1
+    schema_config:
+      configs:
+        - from: 2020-10-24
+          store: boltdb-shipper
+          object_store: filesystem
+          schema: v11
+          index:
+            prefix: index_
+            period: 24h
+    storage_config:
+      boltdb_shipper:
+        active_index_directory: /data/loki/index
+        shared_store: filesystem
+      filesystem:
+        directory: /data/loki/chunks
+    chunk_store_config:
+      max_look_back_period: 672h
+    table_manager:
+      retention_deletes_enabled: true
+      retention_period: 672h
+
+singleBinary:
+  replicas: 1
+  persistence:
+    enabled: true
+    size: 20Gi
+    storageClass: "linode-block-storage-retain"
+
+read:
+  replicas: 0
+
+write:
+  replicas: 0
+
+backend:
+  replicas: 0
+
+gateway:
+  enabled: false
diff --git a/k8s/monitoring/values-production.yaml b/k8s/monitoring/values-production.yaml
index a854c16..0db88e3 100644
--- a/k8s/monitoring/values-production.yaml
+++ b/k8s/monitoring/values-production.yaml
@@ -131,6 +131,40 @@ kube-prometheus-stack:
               requests:
                 storage: 30Gi
 
+    # Alertmanager configuration: only WebrtcBeCrashed alerts are e-mailed;
+    # everything else is routed to the 'null' receiver.
+    # NOTE(review): assumes this block sits under `alertmanager:` - confirm
+    # against the full values file.
+    config:
+      global:
+        resolve_timeout: 5m
+        smtp_smarthost: 'email-smtp.us-east-1.amazonaws.com:587'
+        smtp_from: 'support@jamkazam.com'
+        # NOTE(review): this resembles an IAM user name, not an SES SMTP user
+        # name - confirm that it authenticates.
+        smtp_auth_username: 'ses-smtp-user.20251206-174105'
+        # Never commit the SMTP password. It is read from a Secret named
+        # `alertmanager-smtp` (key `password`) that must be listed in
+        # `alertmanagerSpec.secrets`; needs Alertmanager >= 0.25. Rotate any
+        # password that was ever committed to this file.
+        smtp_auth_password_file: '/etc/alertmanager/secrets/alertmanager-smtp/password'
+        smtp_require_tls: true
+      route:
+        group_by: ['job']
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 12h
+        receiver: 'null'
+        routes:
+          - match:
+              alertname: WebrtcBeCrashed
+            receiver: 'email-alerts'
+      receivers:
+        - name: 'null'
+        - name: 'email-alerts'
+          email_configs:
+            - to: 'alerts@jamkazam.com'
+              send_resolved: true
   grafana:
     persistence:
       enabled: true
diff --git a/k8s/monitoring/values-staging.yaml b/k8s/monitoring/values-staging.yaml
index ab90ca4..95c1666 100644
--- 
a/k8s/monitoring/values-staging.yaml
+++ b/k8s/monitoring/values-staging.yaml
@@ -131,6 +131,40 @@ kube-prometheus-stack:
               requests:
                 storage: 30Gi
 
+    # Alertmanager configuration: only WebrtcBeCrashed alerts are e-mailed;
+    # everything else is routed to the 'null' receiver.
+    # NOTE(review): assumes this block sits under `alertmanager:` - confirm
+    # against the full values file.
+    config:
+      global:
+        resolve_timeout: 5m
+        smtp_smarthost: 'email-smtp.us-east-1.amazonaws.com:587'
+        smtp_from: 'support@jamkazam.com'
+        # NOTE(review): this resembles an IAM user name, not an SES SMTP user
+        # name - confirm that it authenticates.
+        smtp_auth_username: 'ses-smtp-user.20251206-174105'
+        # Never commit the SMTP password. It is read from a Secret named
+        # `alertmanager-smtp` (key `password`) that must be listed in
+        # `alertmanagerSpec.secrets`; needs Alertmanager >= 0.25. Rotate any
+        # password that was ever committed to this file.
+        smtp_auth_password_file: '/etc/alertmanager/secrets/alertmanager-smtp/password'
+        smtp_require_tls: true
+      route:
+        group_by: ['job']
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 12h
+        receiver: 'null'
+        routes:
+          - match:
+              alertname: WebrtcBeCrashed
+            receiver: 'email-alerts'
+      receivers:
+        - name: 'null'
+        - name: 'email-alerts'
+          email_configs:
+            - to: 'alerts@jamkazam.com'
+              send_resolved: true
   grafana:
     persistence:
       enabled: true
diff --git a/k8s/promtail/Chart.yaml b/k8s/promtail/Chart.yaml
new file mode 100644
index 0000000..3d30d2d
--- /dev/null
+++ b/k8s/promtail/Chart.yaml
@@ -0,0 +1,7 @@
+# The packaged chart charts/promtail-6.15.3.tgz is committed next to this file.
+apiVersion: v2
+name: promtail
+description: A Helm chart for Promtail
+type: application
+version: 0.1.0
+appVersion: "1.0"
diff --git a/k8s/promtail/charts/promtail-6.15.3.tgz b/k8s/promtail/charts/promtail-6.15.3.tgz
new file mode 100644
index 0000000..410de2f
Binary files /dev/null and b/k8s/promtail/charts/promtail-6.15.3.tgz differ
diff --git a/k8s/promtail/values.yaml b/k8s/promtail/values.yaml
new file mode 100644
index 0000000..d4cf868
--- /dev/null
+++ b/k8s/promtail/values.yaml
@@ -0,0 +1,6 @@
+# Promtail client configuration: push logs to the Loki push API at
+# loki.loki.svc on port 3100.
+promtail:
+  config:
+    clients:
+      - url: http://loki.loki.svc:3100/loki/api/v1/push
diff --git a/k8s/webrtc-be/templates/alerts.yaml b/k8s/webrtc-be/templates/alerts.yaml
new file mode 100644
index 0000000..aa7e916
--- /dev/null
+++ b/k8s/webrtc-be/templates/alerts.yaml
@@ -0,0 +1,27 @@
+# PrometheusRule for the webrtc-be workload.
+#
+# Only PromQL may appear here. A log-based "WebrtcBeError" alert was removed
+# because its expression was LogQL, which Prometheus cannot parse:
+#   sum(count_over_time({container="webrtc-be", namespace="webrtc-be"} |= "error" [5m])) > 0
+# Prometheus rejects a rule group that contains an unparsable expression, so
+# it also endangered WebrtcBeCrashed. Re-add it as a Loki ruler rule instead.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: webrtc-be-log-alerts
+  labels:
+    app: webrtc-be
+spec:
+  groups:
+    - name: webrtc-be.alerts
+      rules:
+        # Fires when the webrtc-be container restart counter increased within
+        # the last 5 minutes and the condition has held for 1 minute.
+        - alert: WebrtcBeCrashed
+          expr: increase(kube_pod_container_status_restarts_total{container="webrtc-be", namespace="webrtc-be"}[5m]) > 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: "webrtc-be crashed"
+            description: "The webrtc-be pod has crashed. Please check the logs."