diff --git a/Chart.yaml b/Chart.yaml
index 2f5ccef..4ce8a84 100644
--- a/Chart.yaml
+++ b/Chart.yaml
@@ -9,6 +9,29 @@ description: |
     BuildKit Jobs push addon images here; the chart consumes
     them as image volumes. Sovereignty + GFW resistance: no
     cross-cluster image transfer.
+  - cert-manager + Traefik (vendored via Helm dependencies)
+    so the substrate that used to be installed by bootstrap.sh
+    now lives in Git, deployed by Tower's per-cluster Argo
+    Application. Customer's "Connect Server" terminal stops
+    at "kubeconfig sent" — the slow ACME wait happens here in
+    the background.
+  - tenants Namespace + tenants-wildcard Certificate. Per-tenant
+    via .Values.tenant.{domain,wildcardHost}; cert-manager's
+    DNS-01 solver pulls the Cloudflare token from the
+    `cloudflare-api-token` Secret Tower kubectl-applies into the
+    cert-manager namespace at Connect time (secrets stay out of
+    Git).
+
 type: application
-version: 0.1.0
-appVersion: "0.1.0"
+version: 0.2.0
+appVersion: "0.2.0"
+
+dependencies:
+  - name: cert-manager
+    version: "v1.16.1"
+    repository: "https://charts.jetstack.io"
+    condition: certManager.enabled
+  - name: traefik
+    version: "33.2.1"
+    repository: "https://traefik.github.io/charts"
+    condition: traefik.enabled
diff --git a/charts/cert-manager-v1.16.1.tgz b/charts/cert-manager-v1.16.1.tgz
new file mode 100644
index 0000000..29c681d
Binary files /dev/null and b/charts/cert-manager-v1.16.1.tgz differ
diff --git a/charts/traefik-33.2.1.tgz b/charts/traefik-33.2.1.tgz
new file mode 100644
index 0000000..079e961
Binary files /dev/null and b/charts/traefik-33.2.1.tgz differ
diff --git a/templates/cluster-issuer.yaml b/templates/cluster-issuer.yaml
new file mode 100644
index 0000000..aae7284
--- /dev/null
+++ b/templates/cluster-issuer.yaml
@@ -0,0 +1,40 @@
+{{- if .Values.tenant.domain }}
+# letsencrypt-prod ClusterIssuer — DNS-01 challenge via Cloudflare.
+# Scoped to the tenant's Cloudflare zone (.Values.tenant.domain) so
+# cert-manager only attempts records in zones the supplied token can
+# touch — wrong-zone tokens fail loudly at issue time rather than
+# silently re-trying forever.
+#
+# The cloudflare-api-token Secret is NOT in this chart. Tower
+# kubectl-applies it into cert-manager ns at Connect time using the
+# tenant's per-tenant Vault credential (v3/tenants/<tenant>/cloudflare-token).
+# The chart references it by name only.
+#
+# Sync wave 1: this must land AFTER the cert-manager dependency
+# (wave 0) has installed its CRDs and its admission webhook is
+# healthy. Ordering by kind inside a single wave does not wait for
+# the webhook, so the wave is set explicitly.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-prod
+  labels:
+    app.kubernetes.io/managed-by: cluster-platform-v3
+  annotations:
+    argocd.argoproj.io/sync-wave: "1"
+spec:
+  acme:
+    email: {{ required "acme.email is required" .Values.acme.email | quote }}
+    server: {{ .Values.acme.server | quote }}
+    privateKeySecretRef:
+      name: letsencrypt-prod-account-key
+    solvers:
+      - dns01:
+          cloudflare:
+            apiTokenSecretRef:
+              name: {{ .Values.secrets.cloudflareTokenSecret.name | quote }}
+              key: {{ .Values.secrets.cloudflareTokenSecret.key | quote }}
+        selector:
+          dnsZones:
+            - {{ .Values.tenant.domain | quote }}
+{{- end }}
diff --git a/templates/tenants-namespace.yaml b/templates/tenants-namespace.yaml
new file mode 100644
index 0000000..9a642b3
--- /dev/null
+++ b/templates/tenants-namespace.yaml
@@ -0,0 +1,10 @@
+# tenants Namespace — every tenant instance lives here. Created
+# explicitly (rather than relying on Argo's CreateNamespace) because
+# the tenants-wildcard Certificate targets this namespace and Argo's
+# sync wave can race namespace-default-creation otherwise.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: tenants
+  labels:
+    app.kubernetes.io/managed-by: cluster-platform-v3
diff --git a/templates/tenants-wildcard-cert.yaml b/templates/tenants-wildcard-cert.yaml
new file mode 100644
index 0000000..0130ce1
--- /dev/null
+++ b/templates/tenants-wildcard-cert.yaml
@@ -0,0 +1,36 @@
+{{- if .Values.tenant.wildcardHost }}
+# tenants-wildcard Certificate — issued ONCE per cluster, referenced
+# by every tenant instance's IngressRoute. Avoids Let's Encrypt's
+# 50-cert/week per-domain rate limit as the cluster scales to many
+# instances under one tenant.
+#
+# DNS-01 takes 30–90 s in normal Cloudflare conditions; cert-manager
+# retries forever on transient failures. The Argo Application that
+# deploys this chart is "Healthy" only when the Certificate's Ready
+# condition flips to True — Tower's UI uses that as the
+# "Provisioning → Ready" transition for the Server card.
+#
+# Sync wave 1: applied only after the cert-manager dependency (wave 0)
+# is healthy, i.e. once its CRDs and admission webhook are available.
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: tenants-wildcard
+  namespace: tenants
+  labels:
+    app.kubernetes.io/managed-by: cluster-platform-v3
+  annotations:
+    argocd.argoproj.io/sync-wave: "1"
+spec:
+  secretName: tenants-wildcard-tls
+  issuerRef:
+    name: letsencrypt-prod
+    kind: ClusterIssuer
+  commonName: {{ .Values.tenant.wildcardHost | quote }}
+  dnsNames:
+    - {{ .Values.tenant.wildcardHost | quote }}
+  # Renew 30 days before expiry — Let's Encrypt certs are 90-day, so
+  # this gives cert-manager a 30-day window to retry if Cloudflare
+  # has a bad day during renewal.
+  renewBefore: 720h
+{{- end }}
diff --git a/values.yaml b/values.yaml
index 283f635..729013e 100644
--- a/values.yaml
+++ b/values.yaml
@@ -5,6 +5,64 @@
 
 namespace: odoosky-system
 
+# tenant — per-tenant identity injected by Tower as helm.values on
+# the per-cluster Argo Application. Empty defaults are safe to lint:
+# the ClusterIssuer template is skipped while `domain` is empty and
+# the Certificate template while `wildcardHost` is empty, so a real
+# deploy MUST set both or no certificate is ever requested.
+tenant:
+  # Domain the Cloudflare zone covers, e.g. "acme-erp.com".
+  domain: ""
+  # Wildcard hostname the cluster-wide tenants-wildcard cert covers,
+  # e.g. "*.tenants.acme-erp.com". Every tenant instance Ingress
+  # references the resulting Secret (`tenants-wildcard-tls` in the
+  # `tenants` namespace) by name.
+  wildcardHost: ""
+
+# acme — Let's Encrypt registration. Operator email is per-platform,
+# not per-tenant.
+acme:
+  email: m@havari.me
+  server: https://acme-v02.api.letsencrypt.org/directory
+
+# certManager — on/off switch for the upstream jetstack chart (pinned
+# at v1.16.1 by Chart.yaml's dependency, which reads this key as its
+# `condition`).
+certManager:
+  enabled: true
+
+# cert-manager — values handed to the subchart. Helm only passes down
+# values stored under the dependency's own name, so these cannot live
+# under `certManager`. We turn on CRDs and force the namespace so the
+# ClusterIssuer can read its solver Secret from the `cert-manager` ns.
+# NOTE(review): the chart does not create that namespace — presumably
+# Tower does when it applies the Secret; confirm.
+cert-manager:
+  installCRDs: true
+  namespace: cert-manager
+
+# traefik — upstream chart. LoadBalancer Service so the customer's
+# k3s servicelb maps :80/:443 to the host. Tower currently doesn't
+# rely on Traefik's IngressRoute features here; instances are on
+# their own per-tenant Traefik later. This Traefik gives the cluster
+# a default ingress for the registry + future platform endpoints.
+traefik:
+  enabled: true
+  service:
+    type: LoadBalancer
+
+# secrets — Tower applies these out-of-band via the registered
+# kubeconfig at Connect time (B2). The chart references them by
+# name only; values never enter Git.
+secrets:
+  cloudflareTokenSecret:
+    namespace: cert-manager
+    name: cloudflare-api-token
+    key: api-token
+  s3CredentialsSecret:
+    namespace: tenants
+    name: s3-backup-creds
+
 registry:
   enabled: true
   image: