# cluster-platform-v3 — defaults.
#
# Most knobs you'd flip live here so customer-cluster overlays can
# tune sizing without forking the chart.

namespace: odoosky-system

# cluster — per-cluster identity passed by Tower as helm.values on each
# per-cluster Application. The chart uses cluster.name to add a
# differentiator SAN to the tenants-wildcard Certificate so Let's
# Encrypt's duplicate-cert rate limit doesn't collide across one
# tenant's multiple clusters.
cluster:
  name: ""

# tenant — per-tenant identity injected by Tower as helm.values on
# the per-cluster Argo Application. Empty defaults are safe to lint
# but a real deploy MUST set domain + wildcardHost (the Certificate
# template fails with `required` on an empty value).
tenant:
  # Tenant UUID — used by ESO ExternalSecrets to construct the
  # OpenBao path `v3/tenants/<id>/{cloudflare-token,s3-credentials}`.
  # Empty default = ESO ExternalSecret manifests skip rendering (chart
  # remains usable for non-ESO clusters during transition).
  id: ""
  # Tenant slug — used as the per-tenant Secret name suffix
  # (e.g., `cloudflare-api-token-<slug>`). Must match the slug
  # cert-manager's ClusterIssuer references via secrets.cloudflareTokenSecret.
  slug: ""
  # S3-compatible endpoint for the tenant's backup target. When set,
  # the longhorn-s3-creds ExternalSecret manifest renders with
  # AWS_ENDPOINTS literal alongside the access_key+secret_key from
  # OpenBao. Empty = no Longhorn S3 backup wired (instance-level
  # backups still work via s3-backup-creds + the per-tenant CronJob).
  s3Endpoint: ""
  # Domain the Cloudflare zone covers, e.g. "acme-erp.com".
  # Mirror of domains[primary].root — kept for legacy chart consumers.
  domain: ""
  # Wildcard hostname the cluster-wide tenants-wildcard cert covers,
  # e.g. "*.tenants.acme-erp.com". Mirror of domains[primary].wildcardHost.
  # Every tenant instance Ingress references the resulting Secret
  # (`tenants-wildcard-tls` in the `tenants` namespace) by name.
  wildcardHost: ""
  # domains — full multi-domain list (#320.C). Tower passes one entry
  # per domain the tenant has registered; the chart issues one
  # wildcard Certificate per VERIFIED entry. The primary entry
  # produces the canonical `tenants-wildcard-tls` Secret; non-primary
  # entries get `tenants-wildcard-<domain>-tls`.
  # Empty list = legacy single-domain mode (chart synthesizes one
  # entry from domain + wildcardHost above).
  #
  # Each entry shape:
  #   - root: "acme.com"
  #   - wildcardHost: "*.tenants.acme.com"
  #   - primary: true    # exactly one entry should be primary
  #   - verified: true   # chart skips entries with verified=false
  #
  # NOTE(review): nested under `tenant` because the injectedWildcards
  # comment below refers to `tenant.domains[]` — confirm against the
  # templates.
  domains: []

# acme — Let's Encrypt registration. Operator email is per-platform,
# not per-tenant.
acme:
  email: m@havari.me
  server: https://acme-v02.api.letsencrypt.org/directory

# certManager — gate for the conditional in Chart.yaml dependencies.
# Helm reads this for the `condition: certManager.enabled` flag only;
# the actual subchart values live below under the dep name `cert-manager`.
certManager:
  enabled: true

# injectedWildcards — Slice 2B.3 (2026-05-04). Tower's per-tenant
# Vault-stash flow harvests successfully-issued wildcard cert
# Secrets and re-injects them on Reconnect to bypass Let's Encrypt's
# 5-cert/identifier/168h rate limit. When an entry is present
# for a tenant.domains[].root, the chart:
#   - SKIPS the cert-manager Certificate resource for that root
#     (so no ACME order is placed)
#   - Renders a kubernetes.io/tls Secret with the injected crt/key
#     under the SAME name the cert-manager path would have used
#     (`tenants-wildcard-tls` for primary,
#     `tenants-wildcard-<domain>-tls` otherwise) so existing
#     IngressRoutes don't need to change.
# Empty list = legacy ACME-only path. Per-domain — a tenant can
# mix injected + ACME-issued certs across multiple roots.
#
# Each entry shape:
#   - root: "acme.com"
#   - primary: true    # mirrors tenant.domains[i].primary
#   - crt: ""
#   - key: ""
injectedWildcards: []

# cert-manager — values passed THROUGH to the upstream jetstack subchart
# (Chart.yaml dependency name = "cert-manager"). Subchart values must
# nest under the dep name, not under our top-level `certManager` alias —
# putting them under `certManager:` does nothing.
#
# crds.enabled — install the cert-manager CRDs in the same release. The
#   v1.14+ jetstack chart renamed `installCRDs` to `crds.enabled`; the
#   old key is silently ignored, leaving the CRDs absent and any
#   Certificate / ClusterIssuer manifest failing with "no matches for kind".
# crds.keep — leave CRDs in place if the chart is uninstalled. Safer for
#   disconnect flows where the customer might re-add the cluster later.
#
# NOTE(review): both flags are `false` below, so this release does not
# install the CRDs itself; presumably they are delivered out-of-band —
# confirm, otherwise Certificate / ClusterIssuer manifests will fail.
cert-manager:
  crds:
    enabled: false
    keep: false  # ignored when enabled=false
  # startupapicheck — disabled (Slice 2B.1.2, 2026-05-04). The
  # subchart includes a Job that runs as a PostSync hook and tries
  # to verify cert-manager's API is responsive by issuing a test
  # cert through it. Two real costs once cert-manager is proven on
  # the platform:
  #   1. The Job's PostSync hook gates Argo's sync from completing.
  #      On every chart sync (not just install), Argo waits for the
  #      Job to succeed before flipping the App to Synced.
  #   2. When the wildcard Cert is in error (e.g. LE rate limit),
  #      the Job adds even more retry overhead — Argo loops forever.
  # We're not adopting cert-manager fresh — every connect ships the
  # same proven version, the install API surface is stable. The
  # check is dead-weight that masks the actual install timing.
  startupapicheck:
    enabled: false

# traefik — upstream chart. LoadBalancer Service so the customer's
# k3s servicelb maps :80/:443 to the host. Tower currently doesn't
# rely on Traefik's IngressRoute features here; instances are on
# their own per-tenant Traefik later. This Traefik gives the cluster
# a default ingress for the registry + future platform endpoints.
traefik:
  enabled: true
  service:
    type: LoadBalancer
  # Platform-level HTTP → HTTPS redirect. Without this, browsers that
  # default to http:// on a bare hostname hit Traefik's `web`
  # entrypoint with no matching IngressRoute and get the built-in
  # "404 page not found". Enabling redirectTo at the entrypoint makes
  # every TCP-80 request bounce to TCP-443 with a 301 — applies
  # uniformly to all IngressRoutes on this cluster, no per-instance
  # Middleware or duplicate IngressRoute needed.
  ports:
    web:
      redirectTo:
        port: websecure
        priority: 10

# secrets — DEPRECATED for cloudflareTokenSecret as of chart 0.7.3.
# The cluster-issuer.yaml template now hard-references
# `cloudflare-api-token-<slug>` (matches the ESO-created Secret
# in cloudflare-api-token-externalsecret.yaml) and ignores this block.
# Kept here as no-op back-compat for any external chart consumer that
# overrides these values; chart templates no longer read
# secrets.cloudflareTokenSecret. s3CredentialsSecret is still consumed
# by the per-instance backup CronJob path and remains live.
secrets:
  cloudflareTokenSecret:
    namespace: odoosky-system
    name: cloudflare-api-token  # unused since 0.7.3; chart computes from tenant.slug
    key: api-token  # unused since 0.7.3
  s3CredentialsSecret:
    namespace: tenants
    name: s3-backup-creds

# registry — in-cluster image registry used by build Jobs (push) and
# the Odoo Deployment's image volumes (pull).
registry:
  enabled: true
  image:
    repository: registry
    tag: "2.8"
    pullPolicy: IfNotPresent
  # ClusterIP service hostname:
  #   registry.odoosky-system.svc.cluster.local:5000
  # Used internally by build Jobs (push) and the Odoo Deployment's
  # image volumes (pull). Plain HTTP — the registry never sees
  # off-cluster traffic; node-side k3s registries.yaml whitelists
  # the hostname for HTTP image pulls.
  service:
    port: 5000
    # NodePort the kubelet on each node uses to reach the registry
    # (via the host-side 127.0.0.1:<nodePort> mirror entry in
    # /etc/rancher/k3s/registries.yaml). Picked outside the default
    # 30000-32767 NodePort range's busy zone; change if the cluster
    # already uses 30500 for something else.
    # NOTE(review): assumed to live under `service` — confirm the
    # template reads `registry.service.nodePort`.
    nodePort: 30500
  # Storage. The registry survives node restarts but is recreatable —
  # if the PVC is wiped, Tower's ensureAddonImage will rebuild any
  # missing images from Gitea source on demand. So we don't need a
  # large or replicated PV here.
  persistence:
    enabled: false
    size: 10Gi
    storageClass: ""  # "" = use the cluster's default; on k3s that's local-path
  resources:
    requests:
      cpu: 50m
      memory: 64Mi
    limits:
      cpu: 500m
      memory: 256Mi

# longhorn — CSI block storage. See ADR 0003 (odooskyv3 monorepo) for
# the full design. Phase 1 (this commit): declared but disabled.
# Per-server enablement happens via the per-cluster Argo Application's
# helm.parameters (set `longhorn.enabled=true`).
#
# Host prerequisites (already satisfied on bootstrap.sh-Connect'd
# servers): `open-iscsi` package + `iscsi_tcp` kernel module +
# `iscsid` service. Servers provisioned out-of-band must run
# `apt-get install -y open-iscsi && modprobe iscsi_tcp &&
# systemctl enable --now iscsid` before flipping enabled=true.
#
# When `longhorn.enabled=true`, the chart additionally renders:
#   - StorageClass `longhorn-tenants` (replicaCount = .replicas)
#   - VolumeSnapshotClass `longhorn-snapshot-class` for the future
#     VolumeClone Refresh ↓ path
# Existing instances on `local-path` are unaffected — Longhorn
# co-exists, doesn't replace local-path.

# csiSnapshotter — vendored kubernetes-csi/external-snapshotter
# v8.1.0. Provides the standard `snapshot.storage.k8s.io/v1` CRDs
# + snapshot-controller. Required for Tower's CSI VolumeClone path
# (Refresh ↓ + spawn-env seed). See ADR 0003 phase 3.
#
# Only needed when Longhorn (or any other snapshot-capable CSI
# driver) is in use; default true so future server connects get the
# substrate ready out of the box.
csiSnapshotter:
  enabled: true

longhorn:
  enabled: false
  # Replicas per Longhorn volume. Standard tier (single server) =
  # 1 — durability story is async S3 backup, not local replicas.
  # HA-Active sets this to 2 across the cluster's worker nodes.
  replicas: 1
  # Phase 5 of ADR 0003 — Longhorn's own settings, passed straight
  # through to the subchart's `defaultSettings`. The two-layer design:
  #
  #   1. Local CoW snapshots (Longhorn `task: snapshot`) — instant,
  #      zero-blocking, hourly retention. Used for fast undo.
  #   2. Async S3 backup (Longhorn `task: backup`) — block-incremental
  #      upload to tenant's bucket, gradual, never blocks workflow.
  #      Daily retention. The DR layer alongside Tower's existing
  #      application-level pg_dump backup (which is for cross-cluster
  #      migration; Longhorn-S3 is for fast same-cluster restore).
  #
  # The RecurringJob CRDs that drive both layers live in
  # templates/longhorn-recurringjobs.yaml and bind to all volumes
  # via the `default` group automatically.
  defaultSettings:
    defaultDataPath: /var/lib/longhorn
    # backupTarget — set this per-server via the Argo App's helm
    # parameters to enable the async S3 backup channel. Format:
    # `s3://<bucket>@<region>/<path>`. Empty = local snapshots
    # only (local layer still works; just no off-cluster copy).
    backupTarget: ""
    # backupTargetCredentialSecret — name of K8s Secret in the
    # `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID +
    # AWS_SECRET_ACCESS_KEY. Operator kubectl-applies it once per
    # cluster (same pattern as cloudflare-api-token). Cross-namespace
    # Secret references aren't allowed by Longhorn.
    backupTargetCredentialSecret: ""
  # Disable the Helm pre-upgrade checker Job. It's annotated as a
  # `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to
  # PreSync — but the Job's ServiceAccount lives in the regular sync
  # phase, so the Job fails ("ServiceAccount not found") before the
  # SA gets created. Argo's sync model already gives us proper
  # ordering on regular resources; the safety check is redundant.
  preUpgradeChecker:
    jobEnabled: false
    upgradeVersionCheck: false
  # Don't mark Longhorn's bundled StorageClass as cluster-default.
  # k3s ships local-path as default; we keep it that way. New
  # instances stay on local-path unless Tower explicitly stamps
  # storageClassName=longhorn-tenants on their PVCs (Phase 6 of
  # ADR 0003 will do that). Two `default` storage classes is a
  # k8s misconfig — silently picks one for unscoped PVCs.
  persistence:
    defaultClass: false
    defaultClassReplicaCount: 1

# externalSecrets — pilot delivery path for platform-scope Secrets
# previously kubectl-stamped by Tower. 2026-05-07 Phase 1 scope:
# `gitea-archive-pull` only. The other 4 Tower-stamped Secrets stay
# on the legacy path until a dedicated sprint (Item #9, v3 open
# queue).
#
# .openbao.mountPath is per-cluster — Tower passes this as a helm
# parameter so each tenant cluster's ESO authenticates against its
# own OpenBao auth mount (`auth/kubernetes-<cluster>`). Empty default
# means "off"; new clusters with no Tower wiring stay legacy.
externalSecrets:
  enabled: true
  openbao:
    server: "https://vault.odoosky.org"
    mountPath: ""  # Tower fills this per-cluster, e.g. "kubernetes-customer1"
    role: "eso"

# external-secrets — values passed THROUGH to the upstream subchart
# (Chart.yaml dependency name = "external-secrets"). CRDs install on
# first apply. Resource limits conservative — ESO is event-driven
# and idle most of the time.
#
# We keep the subchart's default release-prefixed naming
# (`<release>-external-secrets`) — i.e., we DON'T set
# fullnameOverride. The ClusterSecretStore manifest references the
# SA via `{{ .Release.Name }}-external-secrets` so the name resolves
# correctly per-cluster.
external-secrets:
  installCRDs: true

# keda — event-driven autoscaler. Gate for the conditional in
# Chart.yaml dependencies. Enabled by default so all clusters can
# host AI Studio (per-instance OpenCode pods that scale 0↔1 via the
# HTTP add-on below). KEDA's control plane is ~50 MB RAM idle —
# negligible for clusters that never spawn a Studio.
#
# Subchart values pass through under the dep name (`keda:`) below.
keda:
  enabled: true
  # operator + adapter + webhook — keep CPU/RAM modest. KEDA polls
  # event sources every pollingInterval (default 30s); on a cluster
  # with no ScaledObjects it does no work.
  #
  # NOTE(review): the upstream kedacore/keda chart documents pod
  # resources under `resources.operator` / `resources.metricServer` /
  # `resources.webhooks`; confirm the pinned chart version actually
  # reads the `<component>.resources` keys used below.
  operator:
    resources:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi
  metricsServer:
    resources:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi
  webhooks:
    resources:
      requests:
        cpu: 50m
        memory: 50Mi
      limits:
        cpu: 200m
        memory: 128Mi

# kedaHttpAddon — gate for the HTTP add-on subchart. Same
# enabled-by-default rationale: idle interceptor is small, and only
# clusters with active Studios route traffic through it.
#
# The add-on installs the HTTPScaledObject CRD + an interceptor Service
# in the `keda` namespace at `keda-add-ons-http-interceptor-proxy`.
# Studio-template-v3's IngressRoute targets that interceptor by name
# (it figures out which Studio Deployment to wake by Host header
# matched against HTTPScaledObject.spec.hosts).
kedaHttpAddon:
  enabled: true

# keda-add-ons-http — values passed THROUGH to the HTTP add-on
# subchart. The interceptor is the request-handling hot path; it
# buffers each cold-start request until the target pod is Ready.
# The scaler is the control loop watching HTTPScaledObject status.
keda-add-ons-http:
  interceptor:
    replicas:
      # Scale the interceptor itself with HPA on its own metrics —
      # not zero (it must always be reachable to wake other pods).
      # 1 replica is fine for OdooSky's per-customer-cluster load
      # (single-digit Studios per cluster); upstream's own HPA
      # handles bursts above that.
      min: 1
      max: 3
    resources:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi
  scaler:
    resources:
      requests:
        cpu: 50m
        memory: 64Mi
      limits:
        cpu: 200m
        memory: 128Mi