From 595dd4fc276ac78ab5c5e38150c2c4be7df8397c Mon Sep 17 00:00:00 2001 From: OdooSky Tower Date: Thu, 14 May 2026 18:19:05 +0200 Subject: [PATCH] =?UTF-8?q?Revert=20"fix:=20chart=200.7.5=20=E2=80=94=20ra?= =?UTF-8?q?ise=20KEDA=20interceptor=20cold-start=20timeouts=20(waitTimeout?= =?UTF-8?q?=20180s,=20responseHeaderTimeout=2030s)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f64c374eda2d58bc8640b73a3194020c520df8b6. --- Chart.yaml | 10 +++---- values.yaml | 81 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/Chart.yaml b/Chart.yaml index b64b59c..8707059 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -23,14 +23,14 @@ description: | Git). type: application -version: 0.7.5 -appVersion: "0.7.5" +version: 0.7.7 +appVersion: "0.7.7" # All 6 subcharts now resolve from registry.odoosky.cloud (mirrored # 2026-05-08, KEDA stack added 2026-05-09). Mirror-first discipline -# for regional / air-gapped readiness: a Jetstack / Traefik / -# Longhorn / external-secrets-io / KEDA outage no longer blocks new -# tenant cluster bootstrap. +# + China-region readiness: a Jetstack / Traefik / Longhorn / +# external-secrets-io / KEDA outage no longer blocks new tenant +# cluster bootstrap. # # Original upstream sources (for re-sync if a chart bumps): # cert-manager → https://charts.jetstack.io diff --git a/values.yaml b/values.yaml index c77d347..0c9b61b 100644 --- a/values.yaml +++ b/values.yaml @@ -148,6 +148,15 @@ traefik: # every TCP-80 request bounce to TCP-443 with a 301 — applies # uniformly to all IngressRoutes on this cluster, no per-instance # Middleware or duplicate IngressRoute needed. + # KEDA HTTP add-on routing — Studio chart's IngressRoute lives in + # the per-instance tenants namespace but its backend Service lives + # in odoosky-system (where KEDA was installed by this chart in 0.7.4). + # Without allowCrossNamespace=true, Traefik silently returns 404 on + # the cross-ns reference. Enabling here unblocks every Studio across + # every tenant on every cluster — single platform-wide setting. + providers: + kubernetesCRD: + allowCrossNamespace: true ports: web: redirectTo: @@ -326,6 +335,35 @@ external-secrets: # Subchart values pass through under the dep name (`keda:`) below. keda: enabled: true + # crds.install — explicit true. KEDA's chart default is true, but + # somewhere in the parent-subchart values merge our keda subchart + # was resolving crds.install=false on a fresh render — leaving CRDs + # absent. Without ScaledObject CRD installed, the operator pod + # crashloops at startup ("failed to wait for scaledobject caches + # to sync") and Argo's apply of the keda-add-ons-http interceptor's + # ScaledObject fails ("no matches for kind"). The whole platform + # sync stalls there, the wildcard cert is never issued, and Tower's + # UI flips to "Failed - TLS cert renewal failing -739746d left" + # (Tower computes notAfter - now() against Go's zero-time when the + # Secret is missing, hence the bizarre negative-day display). + # Repro: havari-server03 onboarding 2026-05-10. Manual unblock was + # `kubectl apply --server-side --force-conflicts -f keda-crds.yaml`. + # This explicit true codifies that step into the chart so any + # newly-onboarded server gets CRDs on first sync. + # KEDA puts CRDs in templates/crds/ (not chart/crds/), so they + # need to flow through helm template — which they do once this is + # set. ArgoCD's App-level ServerSideApply=true sync option SHOULD + # cover the 262 KiB annotation overflow these CRDs would hit with + # client-side apply (scaledjobs + scaledobjects each blow the limit + # because their schemas are huge), but ArgoCD's App-level setting + # falls back to client-side for resources that already exist with + # a different field-manager. Belt-and-suspenders: also annotate + # each CRD per-resource so the SSA path is taken regardless of + # who owned the field-manager first. + crds: + install: true + additionalAnnotations: + argocd.argoproj.io/sync-options: ServerSideApply=true # operator + adapter + webhook — keep CPU/RAM modest. KEDA polls # event sources every pollingInterval (default 30s); on a cluster # with no ScaledObjects it does no work. @@ -371,6 +409,29 @@ kedaHttpAddon: # buffers each cold-start request until the target pod is Ready. # The scaler is the control loop watching HTTPScaledObject status. keda-add-ons-http: + # kube-rbac-proxy sidecar — upstream HTTP add-on 0.8.0 references + # gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0 which was retired from + # gcr.io. Override to a current image mirrored to our registry + # (multi-arch preserved via crane copy from quay.io/brancz upstream). + # Without this override the controller-manager pod ImagePullBackOffs + # forever and HTTPScaledObjects never reconcile. + images: + kubeRbacProxy: + name: registry.odoosky.cloud/odoosky/docker-mirror/kube-rbac-proxy + tag: v0.18.0 + # imagePullSecrets — required so the kube-rbac-proxy sidecar + # (mirrored at registry.odoosky.cloud) can pull. The operator + # container itself pulls from ghcr.io which needs no auth, but + # k8s applies imagePullSecrets per-pod (covers all containers). + operator: + imagePullSecrets: + - name: docker-mirror-pull + # Same pattern as keda.crds.install above — explicit true so the + # HTTPScaledObject CRD lands on every fresh server. Without it, the + # interceptor never gets its watch table synced (logs: "table has + # not synced") and Studio cold-starts hang at "Connecting…". + crds: + install: true interceptor: replicas: # Scale the interceptor itself with HPA on its own metrics — @@ -380,26 +441,6 @@ keda-add-ons-http: # handles bursts above that. min: 1 max: 3 - # waitTimeout (→ KEDA_CONDITION_WAIT_TIMEOUT) — how long the - # interceptor holds a request waiting for the target Deployment - # to scale 0→1 and become Ready. Upstream default is 20s; an AI - # Studio cold-start is image pull (~33s for the ~900 MB - # opencode-odoo image) + app boot (~30-60s) ≈ 60-90s, so 20s - # 502s the operator's first click ("context deadline exceeded - # while waiting for workload reach > 0 replicas"). 180s lets the - # first click WAIT through the cold-start instead of erroring; - # a warm pod is unaffected (this only caps the 0→1 wait). - # NOTE: the studio-template-v3 chart's per-HSO `spec.timeouts` - # block does NOT work — HTTPScaledObject CRD 0.8.0 has no such - # field; interceptor timeouts are GLOBAL, set here. - waitTimeout: "180s" - # responseHeaderTimeout (→ KEDA_RESPONSE_HEADER_TIMEOUT) — how - # long the interceptor waits for response headers after forwarding - # to a warm backend. Upstream default 500ms is far too tight for a - # just-woken Studio's first route load (OpenHands boots its - # browser env lazily); 30s is generous headroom without masking a - # genuinely hung backend. - responseHeaderTimeout: "30s" resources: requests: cpu: 50m