diff --git a/Chart.yaml b/Chart.yaml index a2714d6..9fdb60d 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -23,8 +23,8 @@ description: | Git). type: application -version: 0.5.3 -appVersion: "0.5.3" +version: 0.5.4 +appVersion: "0.5.4" dependencies: - name: cert-manager diff --git a/templates/tenants-wildcard-cert.yaml b/templates/tenants-wildcard-cert.yaml index 0f5d15e..96fa445 100644 --- a/templates/tenants-wildcard-cert.yaml +++ b/templates/tenants-wildcard-cert.yaml @@ -16,11 +16,28 @@ # DNS not yet pointed) waits in the data layer; the chart doesn't # try to issue and stall the whole sync. # -# DNS-01 takes 30–90 s in normal Cloudflare conditions; cert-manager -# retries forever on transient failures. The Argo Application that -# deploys this chart is "Healthy" only when EVERY Certificate's -# Ready condition flips to True — multi-domain deploys take a -# proportionally longer first sync. +# DNS-01 takes 30–90 s on a fast day, 5–10 min on a slow one +# (Cloudflare zone propagation + LE order processing). Until Slice +# 2B.1 (2026-05-04) the wildcard Certificate's Ready status gated +# the entire Argo Application's Health — meaning Connect Server +# sat at "Provisioning…" for the full 5–10 min before substrate +# became "Ready", even though all the BASE infra (longhorn, +# cert-manager, traefik, registry) was up within ~30 s. +# +# The annotation `argocd.argoproj.io/ignore-healthcheck: "true"` +# below tells Argo "still sync this resource, but don't include +# its Ready status when computing the parent Application's Health". +# Result: substrate becomes Ready in ~30 s; the wildcard issues in +# the background. +# +# Tradeoff: an instance deployed inside the first ~5 min after +# Connect references a Secret (`tenants-wildcard-tls`) that doesn't +# exist yet — its IngressRoute is healthy but TLS is unavailable. +# Slice 2B.2 will plumb a per-host HTTP-01 fallback so the very +# first deploy is also fast. 
Until then the operator should know: +# Substrate Ready ≠ wildcard ready. Watch for the Secret to appear +# (`kubectl -n tenants get secret tenants-wildcard-tls`) before the +# first deploy on a fresh cluster. {{- $domains := .Values.tenant.domains | default (list) }} {{- if and (eq (len $domains) 0) .Values.tenant.wildcardHost }} {{- $domains = list (dict @@ -47,6 +64,16 @@ metadata: {{- if $d.primary }} odoosky.io/domain-primary: "true" {{- end }} + annotations: + # Slice 2B.1 — substrate Ready in ~30 s. Argo will still + # sync this Certificate (cert-manager will issue it via + # DNS-01 in the background), but its Ready condition does + # NOT gate the parent Application's Health calculation. So + # the cluster-platform-v3 App flips Healthy as soon as the + # base components (longhorn + cert-manager + traefik + + # registry) are up, instead of waiting 5–10 min for LE to + # finish the wildcard issuance. + argocd.argoproj.io/ignore-healthcheck: "true" spec: secretName: {{ printf "tenants-wildcard%s-tls" $suffix | quote }} issuerRef: