---
# cluster-platform-v3 — chart defaults.
#
# The knobs an operator is most likely to flip are collected in this
# file, so customer-cluster overlays can tune sizing without forking
# the chart.

namespace: odoosky-system
|
|
|
|
|
|
# cluster — identity of the individual cluster. Tower supplies it as
# helm.values on each per-cluster Application. The chart uses
# cluster.name to add a differentiating SAN to the tenants-wildcard
# Certificate, so that Let's Encrypt's duplicate-certificate rate limit
# doesn't collide across one tenant's multiple clusters.
cluster:
  name: ""
|
|
|
|
# tenant — per-tenant identity, injected by Tower as helm.values on the
# per-cluster Argo Application. The empty defaults are safe to lint,
# but a real deploy MUST set domain + wildcardHost (the Certificate
# template fails with `required` on an empty value).
tenant:
  # Tenant UUID — used by ESO ExternalSecrets to construct the OpenBao
  # path `v3/tenants/<id>/{cloudflare-token,s3-credentials}`.
  # Empty default = ESO ExternalSecret manifests skip rendering (the
  # chart remains usable for non-ESO clusters during transition).
  id: ""

  # Tenant slug — used as the per-tenant Secret name suffix
  # (e.g., `cloudflare-api-token-<slug>`). The cluster-issuer.yaml
  # template references the Secret by that computed name, which is the
  # Secret ESO creates from cloudflare-api-token-externalsecret.yaml.
  # The legacy secrets.cloudflareTokenSecret block is no longer read for
  # this (deprecated as of chart 0.7.3).
  slug: ""

  # S3-compatible endpoint for the tenant's backup target. When set,
  # the longhorn-s3-creds ExternalSecret manifest renders with an
  # AWS_ENDPOINTS literal alongside the access_key + secret_key from
  # OpenBao. Empty = no Longhorn S3 backup wired (instance-level
  # backups still work via s3-backup-creds + the per-tenant CronJob).
  s3Endpoint: ""

  # Domain the Cloudflare zone covers, e.g. "acme-erp.com".
  # Mirror of domains[primary].root — kept for legacy chart consumers.
  domain: ""

  # Wildcard hostname the cluster-wide tenants-wildcard cert covers,
  # e.g. "*.tenants.acme-erp.com". Mirror of
  # domains[primary].wildcardHost. Every tenant instance Ingress
  # references the resulting Secret (`tenants-wildcard-tls` in the
  # `tenants` namespace) by name.
  wildcardHost: ""

  # domains — full multi-domain list (#320.C). Tower passes one entry
  # per domain the tenant has registered; the chart issues one wildcard
  # Certificate per VERIFIED entry. The primary entry produces the
  # canonical `tenants-wildcard-tls` Secret; non-primary entries get
  # `tenants-wildcard-<root-with-dots-as-dashes>-tls`.
  # Empty list = legacy single-domain mode (the chart synthesizes one
  # entry from domain + wildcardHost above).
  #
  # Each entry has the shape:
  #   - root: "acme.com"
  #     wildcardHost: "*.tenants.acme.com"
  #     primary: true     # exactly one entry should be primary
  #     verified: true    # chart skips entries with verified=false
  domains: []
|
|
|
|
# acme — Let's Encrypt account registration. The operator email is set
# once per platform, not per tenant.
acme:
  email: m@havari.me
  server: https://acme-v02.api.letsencrypt.org/directory
|
|
|
|
# certManager — on/off gate for the conditional dependency declared in
# Chart.yaml. Helm consults this key only for the
# `condition: certManager.enabled` flag; the subchart's actual values
# are set further down, under the dependency name `cert-manager`.
certManager:
  enabled: true
|
|
# injectedWildcards — Slice 2B.3 (2026-05-04). Tower's per-tenant
# Vault-stash flow harvests successfully-issued wildcard cert Secrets
# and re-injects them on Reconnect to bypass Let's Encrypt's
# 5-cert/identifier/168h rate limit. When an entry is present for a
# tenant.domains[].root, the chart:
#   - SKIPS the cert-manager Certificate resource for that root
#     (so no ACME order is placed)
#   - Renders a kubernetes.io/tls Secret with the injected crt/key
#     under the SAME name the cert-manager path would have used
#     (`tenants-wildcard-tls` for primary,
#     `tenants-wildcard-<root-with-dots-as-dashes>-tls` otherwise) so
#     existing IngressRoutes don't need to change.
# Empty list = legacy ACME-only path. Injection is per-domain — a
# tenant can mix injected + ACME-issued certs across multiple roots.
#
# Each entry has the shape:
#   - root: "acme.com"
#     primary: true    # mirrors tenant.domains[i].primary
#     crt: "<PEM cert chain>"
#     key: "<PEM private key>"
#
# NOTE(review): this key follows `certManager.enabled` with no blank
# line in between — confirm against the templates whether it is read
# as a top-level value or as a child of `certManager:`.
injectedWildcards: []
|
|
|
|
# cert-manager — values handed THROUGH to the upstream jetstack
# subchart (Chart.yaml dependency name = "cert-manager"). Subchart
# values have to nest under the dependency name, not under our
# top-level `certManager` alias — anything placed under `certManager:`
# does nothing.
#
# crds.enabled — install the cert-manager CRDs in the same release.
#   The v1.14+ jetstack chart renamed `installCRDs` to `crds.enabled`;
#   the old key is silently ignored, which leaves the CRDs absent and
#   makes any Certificate / ClusterIssuer manifest fail with
#   "no matches for kind".
# crds.keep — leave the CRDs in place if the chart is uninstalled.
#   Safer for disconnect flows where the customer might re-add the
#   cluster later.
# NOTE(review): both flags default to false in this file — presumably
# the CRDs are provided some other way; confirm.
cert-manager:
  crds:
    enabled: false
    keep: false  # ignored when enabled=false

  # startupapicheck — disabled (Slice 2B.1.2, 2026-05-04). The subchart
  # ships a Job that runs as a PostSync hook and tries to verify that
  # cert-manager's API is responsive by issuing a test cert through it.
  # It has two real costs once cert-manager is proven on the platform:
  #   1. The Job's PostSync hook gates Argo's sync from completing. On
  #      every chart sync (not just install), Argo waits for the Job to
  #      succeed before flipping the App to Synced.
  #   2. When the wildcard Cert is in error (e.g. LE rate limit), the
  #      Job adds even more retry overhead — Argo loops forever.
  # We're not adopting cert-manager fresh — every connect ships the
  # same proven version and the install API surface is stable. The
  # check is dead weight that masks the actual install timing.
  startupapicheck:
    enabled: false
|
|
|
|
# traefik — upstream chart. Exposed as a LoadBalancer Service so the
# customer's k3s servicelb maps :80/:443 to the host. Tower currently
# doesn't rely on Traefik's IngressRoute features here; instances get
# their own per-tenant Traefik later. This Traefik gives the cluster a
# default ingress for the registry + future platform endpoints.
traefik:
  enabled: true
  service:
    type: LoadBalancer

  # Platform-level HTTP → HTTPS redirect. Without it, browsers that
  # default to http:// on a bare hostname hit Traefik's `web`
  # entrypoint with no matching IngressRoute and get the built-in
  # "404 page not found". Enabling redirectTo at the entrypoint makes
  # every TCP-80 request bounce to TCP-443 with a 301 — it applies
  # uniformly to all IngressRoutes on this cluster, so no per-instance
  # Middleware or duplicate IngressRoute is needed.
  # NOTE(review): `redirectTo` is specific to certain upstream chart
  # versions (newer releases reportedly use
  # `redirections.entryPoint`) — confirm against the pinned version.
  ports:
    web:
      redirectTo:
        port: websecure
        priority: 10
|
|
|
|
# secrets — the cloudflareTokenSecret half is DEPRECATED as of chart
# 0.7.3. The cluster-issuer.yaml template now hard-references
# `cloudflare-api-token-<tenant.slug>` (matching the ESO-created Secret
# in cloudflare-api-token-externalsecret.yaml) and ignores that block.
# It is kept as a no-op for back-compat with any external chart
# consumer that overrides these values; chart templates no longer read
# secrets.cloudflareTokenSecret. s3CredentialsSecret is still consumed
# by the per-instance backup CronJob path and remains live.
secrets:
  cloudflareTokenSecret:
    namespace: odoosky-system
    name: cloudflare-api-token  # unused since 0.7.3; chart computes from tenant.slug
    key: api-token  # unused since 0.7.3
  s3CredentialsSecret:
    namespace: tenants
    name: s3-backup-creds
|
|
|
|
registry:
  enabled: true
  image:
    repository: registry
    tag: "2.8"
    pullPolicy: IfNotPresent

  # ClusterIP Service hostname:
  #   registry.odoosky-system.svc.cluster.local:5000
  # Used internally by build Jobs (push) and by the Odoo Deployment's
  # image volumes (pull). Plain HTTP — the registry never sees
  # off-cluster traffic, and the node-side k3s registries.yaml
  # whitelists the hostname for HTTP image pulls.
  service:
    port: 5000
    # NodePort the kubelet on each node uses to reach the registry
    # (via the host-side 127.0.0.1:<nodePort> mirror entry in
    # /etc/rancher/k3s/registries.yaml). Chosen to sit away from the
    # busy part of the default 30000-32767 NodePort range; change it
    # if the cluster already uses 30500 for something else.
    nodePort: 30500

  # Storage. The registry survives node restarts but is recreatable —
  # if the PVC is wiped, Tower's ensureAddonImage rebuilds any missing
  # images from Gitea source on demand. So there is no need for a
  # large or replicated PV here.
  persistence:
    enabled: false
    size: 10Gi
    storageClass: ""  # "" = use the cluster's default; on k3s that's local-path

  resources:
    requests:
      cpu: 50m
      memory: 64Mi
    limits:
      cpu: 500m
      memory: 256Mi
|
|
|
|
# longhorn — CSI block storage. See ADR 0003 (odooskyv3 monorepo) for
# the full design. Phase 1 (this commit): declared but disabled.
# Per-server enablement happens via the per-cluster Argo Application's
# helm.parameters (set `longhorn.enabled=true`).
#
# Host prerequisites (already satisfied on bootstrap.sh-Connect'd
# servers): `open-iscsi` package + `iscsi_tcp` kernel module +
# `iscsid` service. Servers provisioned out-of-band must run
# `apt-get install -y open-iscsi && modprobe iscsi_tcp &&
# systemctl enable --now iscsid` before flipping enabled=true.
#
# When `longhorn.enabled=true`, the chart additionally renders:
#   - StorageClass `longhorn-tenants` (replicaCount = .replicas)
#   - VolumeSnapshotClass `longhorn-snapshot-class` for the future
#     VolumeClone Refresh ↓ path
# Existing instances on `local-path` are unaffected — Longhorn
# co-exists with local-path, it doesn't replace it.
#
# (The `longhorn:` values themselves follow the csiSnapshotter stanza
# below.)
#
|
|
# csiSnapshotter — vendored kubernetes-csi/external-snapshotter v8.1.0.
# It provides the standard `snapshot.storage.k8s.io/v1` CRDs plus the
# snapshot-controller, which Tower's CSI VolumeClone path requires
# (Refresh ↓ + spawn-env seed). See ADR 0003 phase 3.
#
# Only needed when Longhorn (or any other snapshot-capable CSI driver)
# is in use; it defaults to true so future server connects get the
# substrate ready out of the box.
csiSnapshotter:
  enabled: true
|
|
|
|
longhorn:
  enabled: false

  # Replicas per Longhorn volume. Standard tier (single server) = 1 —
  # the durability story is async S3 backup, not local replicas.
  # HA-Active sets this to 2 across the cluster's worker nodes.
  replicas: 1

  # Phase 5 of ADR 0003 — Longhorn's own settings, passed straight
  # through to the subchart's `defaultSettings`. The two-layer design:
  #
  #   1. Local CoW snapshots (Longhorn `task: snapshot`) — instant,
  #      zero-blocking, hourly retention. Used for fast undo.
  #   2. Async S3 backup (Longhorn `task: backup`) — block-incremental
  #      upload to the tenant's bucket, gradual, never blocks workflow.
  #      Daily retention. The DR layer alongside Tower's existing
  #      application-level pg_dump backup (which is for cross-cluster
  #      migration; Longhorn-S3 is for fast same-cluster restore).
  #
  # The RecurringJob CRDs that drive both layers live in
  # templates/longhorn-recurringjobs.yaml and bind to all volumes via
  # the `default` group automatically.
  defaultSettings:
    defaultDataPath: /var/lib/longhorn

    # backupTarget — set per-server via the Argo App's helm parameters
    # to enable the async S3 backup channel. Format:
    # `s3://<bucket>@<region>/<prefix>/`. Empty = local snapshots only
    # (the local layer still works; there is just no off-cluster copy).
    backupTarget: ""

    # backupTargetCredentialSecret — name of the K8s Secret in the
    # `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID +
    # AWS_SECRET_ACCESS_KEY. The operator kubectl-applies it once per
    # cluster (same pattern as cloudflare-api-token). Longhorn doesn't
    # allow cross-namespace Secret references.
    backupTargetCredentialSecret: ""

  # Disable the Helm pre-upgrade checker Job. It is annotated with
  # `helm.sh/hook: pre-upgrade,pre-install`, which Argo translates to
  # PreSync — but the Job's ServiceAccount lives in the regular sync
  # phase, so the Job fails ("ServiceAccount not found") before the SA
  # gets created. Argo's sync model already gives proper ordering on
  # regular resources; the safety check is redundant.
  preUpgradeChecker:
    jobEnabled: false
    upgradeVersionCheck: false

  # Don't mark Longhorn's bundled StorageClass as cluster-default.
  # k3s ships local-path as the default and we keep it that way. New
  # instances stay on local-path unless Tower explicitly stamps
  # storageClassName=longhorn-tenants on their PVCs (Phase 6 of
  # ADR 0003 will do that). Two `default` storage classes is a k8s
  # misconfig — one gets silently picked for unscoped PVCs.
  persistence:
    defaultClass: false
    defaultClassReplicaCount: 1
|
|
|
|
# externalSecrets — pilot delivery path for platform-scope Secrets that
# Tower previously kubectl-stamped. Phase 1 scope (2026-05-07):
# `gitea-archive-pull` only. The other 4 Tower-stamped Secrets stay on
# the legacy path until a dedicated sprint (Item #9, v3 open queue).
#
# .openbao.mountPath is per-cluster — Tower passes it as a helm
# parameter so each tenant cluster's ESO authenticates against its own
# OpenBao auth mount (`auth/kubernetes-<cluster>`). The empty default
# means "off"; new clusters with no Tower wiring stay legacy.
externalSecrets:
  enabled: true
  openbao:
    server: "https://vault.odoosky.org"
    mountPath: ""  # Tower fills this per-cluster, e.g. "kubernetes-customer1"
    role: "eso"
|
|
|
|
# external-secrets — values handed THROUGH to the upstream subchart
# (Chart.yaml dependency name = "external-secrets"). CRDs install on
# first apply. Resource limits are conservative — ESO is event-driven
# and idle most of the time.
#
# The subchart's default release-prefixed naming
# (`<release>-external-secrets`) is kept on purpose — i.e., we DON'T
# set fullnameOverride. The ClusterSecretStore manifest references the
# SA via `{{ .Release.Name }}-external-secrets`, so the name resolves
# correctly per-cluster.
external-secrets:
  installCRDs: true
|
|
|
|
# keda — event-driven autoscaler. `keda.enabled` is the gate for the
# conditional in Chart.yaml dependencies. Enabled by default so all
# clusters can host AI Studio (per-instance OpenCode pods that scale
# 0↔1 via the HTTP add-on below). KEDA's control plane is ~50 MB RAM
# idle — negligible for clusters that never spawn a Studio.
#
# Subchart values pass through under the dep name (`keda:`) below.
keda:
  enabled: true

  # CRDs are pre-installed out-of-band on each cluster (kubectl apply
  # --server-side from the chart's templates/crds/*.yaml). The in-chart
  # helm install path is broken on ArgoCD: scaledjobs.keda.sh has a
  # 581 KB schema that exceeds K8s' 262144-byte annotation limit when
  # applied client-side. Server-side apply works at the CRD level but
  # not when ArgoCD goes through its template-and-apply pipeline. The
  # bootstrap step on cluster connect handles the install.
  # See docs/AI_STUDIO_ARCHITECTURE.md (TBD) for the full flow.
  crds:
    install: false

  # operator + adapter + webhook — keep CPU/RAM modest. KEDA polls
  # event sources every pollingInterval (default 30s); on a cluster
  # with no ScaledObjects it does no work.
  #
  # The upstream KEDA chart reads container resources from
  # `resources.<component>` (operator / metricServer / webhooks). A
  # `resources:` map nested under `operator:`, `metricsServer:` or
  # `webhooks:` is not read, so the chart defaults would apply instead.
  # NOTE(review): confirm the key names against the pinned subchart
  # version before rollout — these limits take effect once the keys
  # are read.
  resources:
    operator:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi
    metricServer:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi
    webhooks:
      requests:
        cpu: 50m
        memory: 50Mi
      limits:
        cpu: 200m
        memory: 128Mi
|
|
|
|
# kedaHttpAddon — gate for the HTTP add-on subchart. Same
# enabled-by-default rationale as KEDA itself: the idle interceptor is
# small, and only clusters with active Studios route traffic through
# it.
#
# The add-on installs the HTTPScaledObject CRD plus an interceptor
# Service in the `keda` namespace at
# `keda-add-ons-http-interceptor-proxy`. Studio-template-v3's
# IngressRoute targets that interceptor by name (it figures out which
# Studio Deployment to wake from the Host header, matched against
# HTTPScaledObject.spec.hosts).
kedaHttpAddon:
  enabled: true
|
|
|
|
# keda-add-ons-http — values handed THROUGH to the HTTP add-on
# subchart. The interceptor is the request-handling hot path; it
# buffers each cold-start request until the target pod is Ready.
# The scaler is the control loop watching HTTPScaledObject status.
keda-add-ons-http:
  # The HTTP add-on CRD (HTTPScaledObject) is also pre-installed
  # out-of-band, for symmetry with the KEDA core CRDs above. The HTTP
  # add-on chart itself doesn't have the annotation-size issue (its
  # single CRD is small), but disabling chart-managed install keeps the
  # operational contract uniform: 'CRDs are bootstrap, controllers are
  # chart'.
  crds:
    install: false

  # kube-rbac-proxy sidecar — upstream HTTP add-on 0.8.0 references
  # gcr.io/kubebuilder/kube-rbac-proxy:v0.13.0, which was retired from
  # gcr.io. Override to a current image mirrored to our registry
  # (multi-arch preserved via crane copy from quay.io/brancz upstream).
  # Without this override the controller-manager pod sits in
  # ImagePullBackOff forever and HTTPScaledObjects never reconcile.
  images:
    kubeRbacProxy:
      name: registry.odoosky.cloud/odoosky/docker-mirror/kube-rbac-proxy
      tag: v0.18.0

  interceptor:
    replicas:
      # Scale the interceptor itself with HPA on its own metrics — not
      # to zero (it must always be reachable to wake other pods).
      # 1 replica is fine for OdooSky's per-customer-cluster load
      # (single-digit Studios per cluster); upstream's own HPA handles
      # bursts above that.
      min: 1
      max: 3
    resources:
      requests:
        cpu: 50m
        memory: 100Mi
      limits:
        cpu: 500m
        memory: 256Mi

  scaler:
    resources:
      requests:
        cpu: 50m
        memory: 64Mi
      limits:
        cpu: 200m
        memory: 128Mi
|