From 595dd4fc276ac78ab5c5e38150c2c4be7df8397c Mon Sep 17 00:00:00 2001
From: OdooSky Tower <tower@odoosky.org>
Date: Thu, 14 May 2026 18:19:05 +0200
Subject: [PATCH] =?UTF-8?q?Revert=20"fix:=20chart=200.7.5=20=E2=80=94=20ra?=
 =?UTF-8?q?ise=20KEDA=20interceptor=20cold-start=20timeouts=20(waitTimeout?=
 =?UTF-8?q?=20180s,=20responseHeaderTimeout=2030s)"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

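This reverts commit f64c374eda2d58bc8640b73a3194020c520df8b6.

Effect of the revert, as shown in the diff below:
* Chart.yaml: version/appVersion change from 0.7.5 to 0.7.7.
* values.yaml: restores Traefik allowCrossNamespace, the explicit
  crds.install settings for keda and keda-add-ons-http, and the
  kube-rbac-proxy mirror override with its imagePullSecrets.
* values.yaml: removes the interceptor waitTimeout ("180s") and
  responseHeaderTimeout ("30s") overrides.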
---
 Chart.yaml  | 10 +++----
 values.yaml | 81 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/Chart.yaml b/Chart.yaml
index b64b59c..8707059 100644
--- a/Chart.yaml
+++ b/Chart.yaml
@@ -23,14 +23,14 @@ description: |
       Git).
 
 type: application
-version: 0.7.5
-appVersion: "0.7.5"
+version: 0.7.7
+appVersion: "0.7.7"
 
 # All 6 subcharts now resolve from registry.odoosky.cloud (mirrored
 # 2026-05-08, KEDA stack added 2026-05-09). Mirror-first discipline
-# for regional / air-gapped readiness: a Jetstack / Traefik /
-# Longhorn / external-secrets-io / KEDA outage no longer blocks new
-# tenant cluster bootstrap.
+# + China-region readiness: a Jetstack / Traefik / Longhorn /
+# external-secrets-io / KEDA outage no longer blocks new tenant
+# cluster bootstrap.
 #
 # Original upstream sources (for re-sync if a chart bumps):
 #   cert-manager     → https://charts.jetstack.io
diff --git a/values.yaml b/values.yaml
index c77d347..0c9b61b 100644
--- a/values.yaml
+++ b/values.yaml
@@ -148,6 +148,15 @@ traefik:
   # every TCP-80 request bounce to TCP-443 with a 301 — applies
   # uniformly to all IngressRoutes on this cluster, no per-instance
   # Middleware or duplicate IngressRoute needed.
+  # KEDA HTTP add-on routing — Studio chart's IngressRoute lives in
+  # the per-instance tenants namespace but its backend Service lives
+  # in odoosky-system (where KEDA was installed by this chart in 0.7.4).
+  # Without allowCrossNamespace=true, Traefik silently returns 404 on
+  # the cross-ns reference. Enabling here unblocks every Studio (all
+  # tenants, all clusters) but lets ANY IngressRoute reference any ns.
+  providers:
+    kubernetesCRD:
+      allowCrossNamespace: true
   ports:
     web:
       redirectTo:
@@ -326,6 +335,35 @@ external-secrets:
 # Subchart values pass through under the dep name (`keda:`) below.
 keda:
   enabled: true
+  # crds.install — explicit true. KEDA's chart default is true, but
+  # somewhere in the parent-subchart values merge our keda subchart
+  # was resolving crds.install=false on a fresh render — leaving CRDs
+  # absent. Without ScaledObject CRD installed, the operator pod
+  # crashloops at startup ("failed to wait for scaledobject caches
+  # to sync") and Argo's apply of the keda-add-ons-http interceptor's
+  # ScaledObject fails ("no matches for kind"). The whole platform
+  # sync stalls there, the wildcard cert is never issued, and Tower's
+  # UI flips to "Failed - TLS cert renewal failing -739746d left"
+  # (Tower computes notAfter - now() against Go's zero-time when the
+  # Secret is missing, hence the bizarre negative-day display).
+  # Repro: havari-server03 onboarding 2026-05-10. Manual unblock was
+  # `kubectl apply --server-side --force-conflicts -f keda-crds.yaml`.
+  # This explicit true codifies that step into the chart so any
+  # newly-onboarded server gets CRDs on first sync.
+  # KEDA puts CRDs in templates/crds/ (not chart/crds/), so they
+  # need to flow through helm template — which they do once this is
+  # set. ArgoCD's App-level ServerSideApply=true sync option SHOULD
+  # cover the 256 KiB (262144 B) annotation overflow these CRDs would hit with
+  # client-side apply (scaledjobs + scaledobjects each blow the limit
+  # because their schemas are huge), but ArgoCD's App-level setting
+  # falls back to client-side for resources that already exist with
+  # a different field-manager. Belt-and-suspenders: also annotate
+  # each CRD per-resource so the SSA path is taken regardless of
+  # who owned the field-manager first.
+  crds:
+    install: true
+    additionalAnnotations:
+      argocd.argoproj.io/sync-options: ServerSideApply=true
   # operator + adapter + webhook — keep CPU/RAM modest. KEDA polls
   # event sources every pollingInterval (default 30s); on a cluster
   # with no ScaledObjects it does no work.
@@ -371,6 +409,29 @@ kedaHttpAddon:
 # buffers each cold-start request until the target pod is Ready.
 # The scaler is the control loop watching HTTPScaledObject status.
 keda-add-ons-http:
+  # kube-rbac-proxy sidecar — upstream HTTP add-on 0.8.0 references
+  # gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0 which was retired from
+  # gcr.io. Override to a current image mirrored to our registry
+  # (multi-arch preserved via crane copy from quay.io/brancz upstream).
+  # Without this override the controller-manager pod ImagePullBackOffs
+  # forever and HTTPScaledObjects never reconcile.
+  images:
+    kubeRbacProxy:
+      name: registry.odoosky.cloud/odoosky/docker-mirror/kube-rbac-proxy
+      tag: v0.18.0
+  # imagePullSecrets — required so the kube-rbac-proxy sidecar
+  # (mirrored at registry.odoosky.cloud) can pull. The operator
+  # container itself pulls from ghcr.io which needs no auth, but
+  # k8s applies imagePullSecrets per-pod (covers all containers).
+  operator:
+    imagePullSecrets:
+      - name: docker-mirror-pull
+  # Same pattern as keda.crds.install above — explicit true so the
+  # HTTPScaledObject CRD lands on every fresh server. Without it, the
+  # interceptor never gets its watch table synced (logs: "table has
+  # not synced") and Studio cold-starts hang at "Connecting…".
+  crds:
+    install: true
   interceptor:
     replicas:
       # Scale the interceptor itself with HPA on its own metrics —
@@ -380,26 +441,6 @@ keda-add-ons-http:
       # handles bursts above that.
       min: 1
       max: 3
-      # waitTimeout (→ KEDA_CONDITION_WAIT_TIMEOUT) — how long the
-      # interceptor holds a request waiting for the target Deployment
-      # to scale 0→1 and become Ready. Upstream default is 20s; an AI
-      # Studio cold-start is image pull (~33s for the ~900 MB
-      # opencode-odoo image) + app boot (~30-60s) ≈ 60-90s, so 20s
-      # 502s the operator's first click ("context deadline exceeded
-      # while waiting for workload reach > 0 replicas"). 180s lets the
-      # first click WAIT through the cold-start instead of erroring;
-      # a warm pod is unaffected (this only caps the 0→1 wait).
-      # NOTE: the studio-template-v3 chart's per-HSO `spec.timeouts`
-      # block does NOT work — HTTPScaledObject CRD 0.8.0 has no such
-      # field; interceptor timeouts are GLOBAL, set here.
-      waitTimeout: "180s"
-    # responseHeaderTimeout (→ KEDA_RESPONSE_HEADER_TIMEOUT) — how
-    # long the interceptor waits for response headers after forwarding
-    # to a warm backend. Upstream default 500ms is far too tight for a
-    # just-woken Studio's first route load (OpenHands boots its
-    # browser env lazily); 30s is generous headroom without masking a
-    # genuinely hung backend.
-    responseHeaderTimeout: "30s"
     resources:
       requests:
         cpu: 50m