diff --git a/Chart.yaml b/Chart.yaml index 8707059..b64b59c 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -23,14 +23,14 @@ description: | Git). type: application -version: 0.7.7 -appVersion: "0.7.7" +version: 0.7.5 +appVersion: "0.7.5" # All 6 subcharts now resolve from registry.odoosky.cloud (mirrored # 2026-05-08, KEDA stack added 2026-05-09). Mirror-first discipline -# + China-region readiness: a Jetstack / Traefik / Longhorn / -# external-secrets-io / KEDA outage no longer blocks new tenant -# cluster bootstrap. +# for regional / air-gapped readiness: a Jetstack / Traefik / +# Longhorn / external-secrets-io / KEDA outage no longer blocks new +# tenant cluster bootstrap. # # Original upstream sources (for re-sync if a chart bumps): # cert-manager → https://charts.jetstack.io diff --git a/values.yaml b/values.yaml index 0c9b61b..c77d347 100644 --- a/values.yaml +++ b/values.yaml @@ -148,15 +148,6 @@ traefik: # every TCP-80 request bounce to TCP-443 with a 301 — applies # uniformly to all IngressRoutes on this cluster, no per-instance # Middleware or duplicate IngressRoute needed. - # KEDA HTTP add-on routing — Studio chart's IngressRoute lives in - # the per-instance tenants namespace but its backend Service lives - # in odoosky-system (where KEDA was installed by this chart in 0.7.4). - # Without allowCrossNamespace=true, Traefik silently returns 404 on - # the cross-ns reference. Enabling here unblocks every Studio across - # every tenant on every cluster — single platform-wide setting. - providers: - kubernetesCRD: - allowCrossNamespace: true ports: web: redirectTo: @@ -335,35 +326,6 @@ external-secrets: # Subchart values pass through under the dep name (`keda:`) below. keda: enabled: true - # crds.install — explicit true. KEDA's chart default is true, but - # somewhere in the parent-subchart values merge our keda subchart - # was resolving crds.install=false on a fresh render — leaving CRDs - # absent. 
Without ScaledObject CRD installed, the operator pod - # crashloops at startup ("failed to wait for scaledobject caches - # to sync") and Argo's apply of the keda-add-ons-http interceptor's - # ScaledObject fails ("no matches for kind"). The whole platform - # sync stalls there, the wildcard cert is never issued, and Tower's - # UI flips to "Failed - TLS cert renewal failing -739746d left" - # (Tower computes notAfter - now() against Go's zero-time when the - # Secret is missing, hence the bizarre negative-day display). - # Repro: havari-server03 onboarding 2026-05-10. Manual unblock was - # `kubectl apply --server-side --force-conflicts -f keda-crds.yaml`. - # This explicit true codifies that step into the chart so any - # newly-onboarded server gets CRDs on first sync. - # KEDA puts CRDs in templates/crds/ (not chart/crds/), so they - # need to flow through helm template — which they do once this is - # set. ArgoCD's App-level ServerSideApply=true sync option SHOULD - # cover the 262 KiB annotation overflow these CRDs would hit with - # client-side apply (scaledjobs + scaledobjects each blow the limit - # because their schemas are huge), but ArgoCD's App-level setting - # falls back to client-side for resources that already exist with - # a different field-manager. Belt-and-suspenders: also annotate - # each CRD per-resource so the SSA path is taken regardless of - # who owned the field-manager first. - crds: - install: true - additionalAnnotations: - argocd.argoproj.io/sync-options: ServerSideApply=true # operator + adapter + webhook — keep CPU/RAM modest. KEDA polls # event sources every pollingInterval (default 30s); on a cluster # with no ScaledObjects it does no work. @@ -409,29 +371,6 @@ kedaHttpAddon: # buffers each cold-start request until the target pod is Ready. # The scaler is the control loop watching HTTPScaledObject status. 
keda-add-ons-http: - # kube-rbac-proxy sidecar — upstream HTTP add-on 0.8.0 references - # gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0 which was retired from - # gcr.io. Override to a current image mirrored to our registry - # (multi-arch preserved via crane copy from quay.io/brancz upstream). - # Without this override the controller-manager pod ImagePullBackOffs - # forever and HTTPScaledObjects never reconcile. - images: - kubeRbacProxy: - name: registry.odoosky.cloud/odoosky/docker-mirror/kube-rbac-proxy - tag: v0.18.0 - # imagePullSecrets — required so the kube-rbac-proxy sidecar - # (mirrored at registry.odoosky.cloud) can pull. The operator - # container itself pulls from ghcr.io which needs no auth, but - # k8s applies imagePullSecrets per-pod (covers all containers). - operator: - imagePullSecrets: - - name: docker-mirror-pull - # Same pattern as keda.crds.install above — explicit true so the - # HTTPScaledObject CRD lands on every fresh server. Without it, the - # interceptor never gets its watch table synced (logs: "table has - # not synced") and Studio cold-starts hang at "Connecting…". - crds: - install: true interceptor: replicas: # Scale the interceptor itself with HPA on its own metrics — @@ -441,6 +380,26 @@ keda-add-ons-http: # handles bursts above that. min: 1 max: 3 + # waitTimeout (→ KEDA_CONDITION_WAIT_TIMEOUT) — how long the + # interceptor holds a request waiting for the target Deployment + # to scale 0→1 and become Ready. Upstream default is 20s; an AI + # Studio cold-start is image pull (~33s for the ~900 MB + # opencode-odoo image) + app boot (~30-60s) ≈ 60-90s, so the 20s default + # returns 502 on the operator's first click ("context deadline exceeded + # while waiting for workload reach > 0 replicas"). 180s lets the + # first click WAIT through the cold-start instead of erroring; + # a warm pod is unaffected (this only caps the 0→1 wait). 
+ # NOTE: the studio-template-v3 chart's per-HSO `spec.timeouts` + # block does NOT work — HTTPScaledObject CRD 0.8.0 has no such + # field; interceptor timeouts are GLOBAL, set here. + waitTimeout: "180s" + # responseHeaderTimeout (→ KEDA_RESPONSE_HEADER_TIMEOUT) — how + # long the interceptor waits for response headers after forwarding + # to a warm backend. Upstream default 500ms is far too tight for a + # just-woken Studio's first route load (OpenHands boots its + # browser env lazily); 30s is generous headroom without masking a + # genuinely hung backend. + responseHeaderTimeout: "30s" resources: requests: cpu: 50m