From 252ac78888705863015590c85306dac245e2940d Mon Sep 17 00:00:00 2001
From: OdooSky v3
Date: Mon, 4 May 2026 13:09:47 +0300
Subject: [PATCH] =?UTF-8?q?feat(slice=202B.1.1):=20sync=20waves=20?=
 =?UTF-8?q?=E2=80=94=20kill=20the=20cert-manager-webhook=20race=20(chart?=
 =?UTF-8?q?=200.5.5)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add explicit Argo CD sync-wave annotations so that resources which
depend on cert-manager are applied in later waves than cert-manager
itself (which stays at the default wave 0):

* templates/cluster-issuer.yaml: ClusterIssuer letsencrypt-prod is
  annotated with sync-wave "1".
* templates/tenants-wildcard-cert.yaml: the tenants wildcard
  Certificate is annotated with sync-wave "2", after its ClusterIssuer.
* Chart.yaml: version and appVersion bumped from 0.5.4 to 0.5.5.

The rationale (admission-webhook rejection followed by retry back-off,
observed on demo-server105) is recorded in the template comments.

---
 Chart.yaml                           |  4 ++--
 templates/cluster-issuer.yaml        | 19 ++++++++++++++++---
 templates/tenants-wildcard-cert.yaml |  8 ++++++++
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/Chart.yaml b/Chart.yaml
index 9fdb60d..17a40ff 100644
--- a/Chart.yaml
+++ b/Chart.yaml
@@ -23,8 +23,8 @@ description: |
   Git).
 
 type: application
-version: 0.5.4
-appVersion: "0.5.4"
+version: 0.5.5
+appVersion: "0.5.5"
 
 dependencies:
   - name: cert-manager
diff --git a/templates/cluster-issuer.yaml b/templates/cluster-issuer.yaml
index 5eb9c3f..d421d52 100644
--- a/templates/cluster-issuer.yaml
+++ b/templates/cluster-issuer.yaml
@@ -18,13 +18,26 @@
 # tenant's per-tenant Vault credential (v3/tenants//cloudflare-token).
 # The chart references it by name only.
 #
-# Sync wave: needs to land AFTER cert-manager's CRDs are installed
-# (cert-manager dep installs first); Argo's default ordering by kind
-# handles this.
+# Sync wave 1 (Slice 2B.1.1, 2026-05-04). cert-manager itself
+# installs at the default wave 0; Argo waits for ALL wave-0
+# resources (cert-manager Deployments + webhook Service) to be
+# Healthy before applying wave 1. Without this we hit a race:
+# Argo applied this ClusterIssuer in the same wave as cert-manager
+# Deployments → cert-manager-webhook wasn't Ready yet → admission
+# webhook rejected the resource → Argo backed off exponentially
+# 30-90s before retrying. retries=2 was the smoking gun in the
+# demo-server105 timing analysis (3 min ready instead of ~45 s).
+#
+# Note ordering: ClusterIssuer at wave 1, Certificate at wave 2
+# (in tenants-wildcard-cert.yaml) — Certificate references the
+# ClusterIssuer by name, so the resource graph also reflects the
+# logical dependency.
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
   name: letsencrypt-prod
+  annotations:
+    argocd.argoproj.io/sync-wave: "1"
   labels:
     app.kubernetes.io/managed-by: cluster-platform-v3
 spec:
diff --git a/templates/tenants-wildcard-cert.yaml b/templates/tenants-wildcard-cert.yaml
index 96fa445..1d7e22d 100644
--- a/templates/tenants-wildcard-cert.yaml
+++ b/templates/tenants-wildcard-cert.yaml
@@ -74,6 +74,14 @@ metadata:
     # registry) are up, instead of waiting 5–10 min for LE to
     # finish the wildcard issuance.
     argocd.argoproj.io/sync-options: SkipHealthCheck=true
+    # Slice 2B.1.1 — wave 2: apply AFTER the ClusterIssuer
+    # (wave 1) which depends on cert-manager (wave 0 default).
+    # Argo enforces strict wave ordering with health-gating
+    # between waves, so the Certificate never lands before its
+    # ClusterIssuer exists or before cert-manager-webhook is
+    # accepting admission requests. Eliminates the retries=2
+    # exponential-backoff penalty observed on demo-server105.
+    argocd.argoproj.io/sync-wave: "2"
 spec:
   secretName: {{ printf "tenants-wildcard%s-tls" $suffix | quote }}
   issuerRef: