From 3e642dd7a1f2d1bc28462e1e3b7321b44dcf42c9 Mon Sep 17 00:00:00 2001 From: OdooSky Bot Date: Sat, 2 May 2026 23:14:15 +0300 Subject: [PATCH] 0.5.0: Longhorn local snapshots + async S3 backup (#347 phase 5) --- Chart.yaml | 4 +- templates/longhorn-recurringjobs.yaml | 57 +++++++++++++++++++++++++++ values.yaml | 43 +++++++++++++------- 3 files changed, 87 insertions(+), 17 deletions(-) create mode 100644 templates/longhorn-recurringjobs.yaml diff --git a/Chart.yaml b/Chart.yaml index b80dfaf..eb9a70f 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -23,8 +23,8 @@ description: | Git). type: application -version: 0.4.0 -appVersion: "0.4.0" +version: 0.5.0 +appVersion: "0.5.0" dependencies: - name: cert-manager diff --git a/templates/longhorn-recurringjobs.yaml b/templates/longhorn-recurringjobs.yaml new file mode 100644 index 0000000..bce8889 --- /dev/null +++ b/templates/longhorn-recurringjobs.yaml @@ -0,0 +1,57 @@ +{{- /* +Phase 5 of ADR 0003 — two-layer protection for tenant volumes. + +Layer A — local CoW snapshots (`task: snapshot`) + Hourly. Instant. Zero blocking. Block-level CoW means the snapshot + shares blocks with live data; only diverged writes consume new + space. Cheap to keep ~24 hours of granular undo. + +Layer B — async S3 backup (`task: backup`) + Daily. Block-incremental. The customer's workflow never waits on + the upload — Longhorn streams blocks to the configured S3 target + in the background. Renders the cluster's data durable off-cluster + even if Layer A snapshots are wiped (e.g. server reformat). + +Both layers are independent of Tower's existing application-level +pg_dump backup. The application backup captures higher-level +semantic state (schema-aware, restorable to a different PG major +or cluster) and is what makes cross-cluster migration possible. +The Longhorn layers capture block-level state and are what makes +fast same-cluster restore possible. Both run; the customer keeps +both. 
Decision 0002 (Standard tier ships always-on durable backup) +is satisfied by the application layer alone; Longhorn-S3 is the +velocity-and-redundancy upgrade. + +Both jobs target Longhorn's `default` group, which auto-includes +every volume with no explicit recurring-job reference. So the +schedule applies to existing AND future tenant PVCs without +operator action per-instance. +*/ -}} +{{- if .Values.longhorn.enabled }} +--- +apiVersion: longhorn.io/v1beta2 +kind: RecurringJob +metadata: + name: tenants-snapshot-hourly + namespace: longhorn-system +spec: + cron: "0 * * * *" + task: snapshot + groups: [default] + retain: 24 + concurrency: 2 +{{- if .Values.longhorn.defaultSettings.backupTarget }} +--- +apiVersion: longhorn.io/v1beta2 +kind: RecurringJob +metadata: + name: tenants-backup-daily + namespace: longhorn-system +spec: + cron: "0 3 * * *" + task: backup + groups: [default] + retain: 7 + concurrency: 2 +{{- end }} +{{- end }} diff --git a/values.yaml b/values.yaml index ce3d908..12c99eb 100644 --- a/values.yaml +++ b/values.yaml @@ -144,23 +144,36 @@ csiSnapshotter: longhorn: enabled: false # Replicas per Longhorn volume. Standard tier (single server) = - # 1 — durability story is hourly S3 backup, not local replicas. + # 1 — durability story is async S3 backup, not local replicas. # HA-Active sets this to 2 across the cluster's worker nodes. replicas: 1 - # Default data path. k3s nodes get `/var/lib/longhorn` by default; - # production servers may want this on a separate disk for IOPS - # isolation from the OS root volume. - defaultDataPath: /var/lib/longhorn - # S3 backup target for Longhorn's own block-level backups (DR - # layer alongside Tower's application-level pg_dump path). When - # set, Longhorn writes block-incremental backups to this prefix - # daily. Empty = block-level backup disabled, application backup - # only. - backupTarget: "" - # Same S3 secret Tower's application backup already uses. 
- backupCredsSecret: - namespace: tenants - name: s3-backup-creds + # Phase 5 of ADR 0003 — Longhorn's own settings, passed straight + # through to the subchart's `defaultSettings`. The two-layer design: + # + # 1. Local CoW snapshots (Longhorn `task: snapshot`) — instant, + # zero-blocking, hourly retention. Used for fast undo. + # 2. Async S3 backup (Longhorn `task: backup`) — block-incremental + # upload to tenant's bucket, gradual, never blocks workflow. + # Daily retention. The DR layer alongside Tower's existing + # application-level pg_dump backup (which is for cross-cluster + # migration; Longhorn-S3 is for fast same-cluster restore). + # + # The RecurringJob CRDs that drive both layers live in + # templates/longhorn-recurringjobs.yaml and bind to all volumes + # via the `default` group automatically. + defaultSettings: + defaultDataPath: /var/lib/longhorn + # backupTarget — set this per-server via the Argo App's helm + # parameters to enable the async S3 backup channel. Format: + # `s3://<bucket>@<region>/<prefix>/`. Empty = local snapshots + # only (local layer still works; just no off-cluster copy). + backupTarget: "" + # backupTargetCredentialSecret — name of K8s Secret in the + # `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID + + # AWS_SECRET_ACCESS_KEY. Operator kubectl-applies it once per + # cluster (same pattern as cloudflare-api-token). Cross-namespace + # Secret references aren't allowed by Longhorn. + backupTargetCredentialSecret: "" # Disable the Helm pre-upgrade checker Job. It's annotated as a # `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to # PreSync — but the Job's ServiceAccount lives in the regular sync