0.5.0: Longhorn local snapshots + async S3 backup (#347 phase 5)

2026-05-02 23:14:15 +03:00
parent 8fca9aadfa
commit 3e642dd7a1
3 changed files with 87 additions and 17 deletions
--- a/Chart.yaml
+++ b/Chart.yaml
@@ -23,8 +23,8 @@ description: |
      Git).
 type: application
-version: 0.4.0
+version: 0.5.0
-appVersion: "0.4.0"
+appVersion: "0.5.0"
 dependencies:
  - name: cert-manager
--- a/templates/longhorn-recurringjobs.yaml
+++ b/templates/longhorn-recurringjobs.yaml
@@ -0,0 +1,57 @@
 {{- /*
 Phase 5 of ADR 0003 — two-layer protection for tenant volumes.
 Layer A — local CoW snapshots (`task: snapshot`)
  Hourly. Instant. Zero blocking. Block-level CoW means the snapshot
  shares blocks with live data; only diverged writes consume new
  space. Cheap to keep ~24 hours of granular undo.
 Layer B — async S3 backup (`task: backup`)
  Daily. Block-incremental. The customer's workflow never waits on
  the upload — Longhorn streams blocks to the configured S3 target
  in the background. Renders the cluster's data durable off-cluster
  even if Layer A snapshots are wiped (e.g. server reformat).
 Both layers are independent of Tower's existing application-level
 pg_dump backup. The application backup captures higher-level
 semantic state (schema-aware, restorable to a different PG major
 or cluster) and is what makes cross-cluster migration possible.
 The Longhorn layers capture block-level state and are what makes
 fast same-cluster restore possible. Both run; the customer keeps
 both. Decision 0002 (Standard tier ships always-on durable backup)
 is satisfied by the application layer alone; Longhorn-S3 is the
 velocity-and-redundancy upgrade.
 Both jobs target Longhorn's `default` group, which auto-includes
 every volume with no explicit recurring-job reference. So the
 schedule applies to existing AND future tenant PVCs without
 operator action per-instance.
 */ -}}
 {{- if .Values.longhorn.enabled }}
 ---
 # Layer A: local snapshot job for tenant volumes.
 # Fires at minute 0 of every hour; `retain: 24` keeps roughly the last
 # 24 hours of hourly snapshots for granular undo.
 apiVersion: longhorn.io/v1beta2
 kind: RecurringJob
 metadata:
  name: tenants-snapshot-hourly
  namespace: longhorn-system
 spec:
  cron: "0 * * * *"
  task: snapshot
  # `default` group: covers every volume that has no explicit
  # recurring-job reference, so existing and future tenant PVCs are
  # included without per-instance operator action.
  groups: [default]
  retain: 24
  # NOTE(review): presumably the number of volumes processed in
  # parallel per run; confirm against the Longhorn RecurringJob docs.
  concurrency: 2
 {{- if .Values.longhorn.defaultSettings.backupTarget }}
 ---
 # Layer B: off-cluster backup job for tenant volumes.
 # Only rendered when `.Values.longhorn.defaultSettings.backupTarget`
 # is non-empty (see the enclosing template conditional).
 # Fires once a day at 03:00; `retain: 7` keeps about a week of backups.
 # NOTE(review): the timezone the cron is evaluated in is not visible
 # here; confirm before relying on a specific wall-clock time.
 apiVersion: longhorn.io/v1beta2
 kind: RecurringJob
 metadata:
  name: tenants-backup-daily
  namespace: longhorn-system
 spec:
  cron: "0 3 * * *"
  task: backup
  # `default` group: covers every volume that has no explicit
  # recurring-job reference, so existing and future tenant PVCs are
  # included without per-instance operator action.
  groups: [default]
  retain: 7
  # NOTE(review): presumably the number of volumes processed in
  # parallel per run; confirm against the Longhorn RecurringJob docs.
  concurrency: 2
 {{- end }}
 {{- end }}
--- a/values.yaml
+++ b/values.yaml
@@ -144,23 +144,36 @@ csiSnapshotter:
 longhorn:
  enabled: false
  # Replicas per Longhorn volume. Standard tier (single server) =
-  # 1 — durability story is hourly S3 backup, not local replicas.
+  # 1 — durability story is async S3 backup, not local replicas.
  # HA-Active sets this to 2 across the cluster's worker nodes.
  replicas: 1
-  # Default data path. k3s nodes get `/var/lib/longhorn` by default;
+  # Phase 5 of ADR 0003 — Longhorn's own settings, passed straight
-  # production servers may want this on a separate disk for IOPS
+  # through to the subchart's `defaultSettings`. The two-layer design:
-  # isolation from the OS root volume.
+  #
-  defaultDataPath: /var/lib/longhorn
+  #   1. Local CoW snapshots (Longhorn `task: snapshot`) — instant,
-  # S3 backup target for Longhorn's own block-level backups (DR
+  #      zero-blocking, taken hourly. Used for fast undo.
-  # layer alongside Tower's application-level pg_dump path). When
+  #   2. Async S3 backup (Longhorn `task: backup`) — block-incremental
-  # set, Longhorn writes block-incremental backups to this prefix
+  #      upload to the tenant's bucket; gradual, never blocks the workflow.
-  # daily. Empty = block-level backup disabled, application backup
+  #      Runs daily. The DR layer alongside Tower's existing
-  # only.
+  #      application-level pg_dump backup (which is for cross-cluster
-  backupTarget: ""
+  #      migration; Longhorn-S3 is for fast same-cluster restore).
-  # Same S3 secret Tower's application backup already uses.
+  #
-  backupCredsSecret:
+  # The RecurringJob CRDs that drive both layers live in
-    namespace: tenants
+  # templates/longhorn-recurringjobs.yaml and bind to all volumes
-    name: s3-backup-creds
+  # via the `default` group automatically.
  defaultSettings:
    defaultDataPath: /var/lib/longhorn
    # backupTarget — set this per-server via the Argo App's helm
    # parameters to enable the async S3 backup channel. Format:
    # `s3://<bucket>@<region>/<prefix>/`. Empty = local snapshots
    # only (local layer still works; just no off-cluster block-level
    # copy — the application-level pg_dump backup is unaffected).
    backupTarget: ""
    # backupTargetCredentialSecret — name of K8s Secret in the
    # `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID +
    # AWS_SECRET_ACCESS_KEY. Operator kubectl-applies it once per
    # cluster (same pattern as cloudflare-api-token). Cross-namespace
    # Secret references aren't allowed by Longhorn.
    backupTargetCredentialSecret: ""
  # Disable the Helm pre-upgrade checker Job. It's annotated as a
  # `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to
  # PreSync — but the Job's ServiceAccount lives in the regular sync