0.5.0: Longhorn local snapshots + async S3 backup (#347 phase 5)
This commit is contained in:
@@ -23,8 +23,8 @@ description: |
|
||||
Git).
|
||||
|
||||
type: application
|
||||
version: 0.4.0
|
||||
appVersion: "0.4.0"
|
||||
version: 0.5.0
|
||||
appVersion: "0.5.0"
|
||||
|
||||
dependencies:
|
||||
- name: cert-manager
|
||||
|
||||
57
templates/longhorn-recurringjobs.yaml
Normal file
57
templates/longhorn-recurringjobs.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
{{- /*
|
||||
Phase 5 of ADR 0003 — two-layer protection for tenant volumes.
|
||||
|
||||
Layer A — local CoW snapshots (`task: snapshot`)
|
||||
Hourly. Instant. Zero blocking. Block-level CoW means the snapshot
|
||||
shares blocks with live data; only diverged writes consume new
|
||||
space. Cheap to keep ~24 hours of granular undo.
|
||||
|
||||
Layer B — async S3 backup (`task: backup`)
|
||||
Daily. Block-incremental. The customer's workflow never waits on
|
||||
the upload — Longhorn streams blocks to the configured S3 target
|
||||
in the background. Keeps the cluster's data durable off-cluster
|
||||
even if Layer A snapshots are wiped (e.g. server reformat).
|
||||
|
||||
Both layers are independent of Tower's existing application-level
|
||||
pg_dump backup. The application backup captures higher-level
|
||||
semantic state (schema-aware, restorable to a different PG major
|
||||
or cluster) and is what makes cross-cluster migration possible.
|
||||
The Longhorn layers capture block-level state and are what makes
|
||||
fast same-cluster restore possible. Both run; the customer keeps
|
||||
both. Decision 0002 (Standard tier ships always-on durable backup)
|
||||
is satisfied by the application layer alone; Longhorn-S3 is the
|
||||
velocity-and-redundancy upgrade.
|
||||
|
||||
Both jobs target Longhorn's `default` group, which auto-includes
|
||||
every volume with no explicit recurring-job reference. So the
|
||||
schedule applies to existing AND future tenant PVCs without
|
||||
operator action per-instance.
|
||||
*/ -}}
|
||||
{{- /*
Gate: nothing below is rendered unless Longhorn is enabled for this release.
*/ -}}
{{- $longhorn := .Values.longhorn -}}
{{- if $longhorn.enabled }}
{{- /*
Layer A: local snapshot job. An hourly cron with retain 24 keeps roughly
one day of hourly restore points for volumes in the `default` group.
*/}}
---
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: tenants-snapshot-hourly
  namespace: longhorn-system
spec:
  task: snapshot
  cron: "0 * * * *"
  retain: 24
  concurrency: 2
  groups:
    - default
{{- /*
Layer B: off-cluster backup job. Rendered only when a backup target is
configured; an empty `defaultSettings.backupTarget` leaves the cluster with
local snapshots only. Daily at 03:00 with retain 7 keeps a week of backups.
*/}}
{{- if $longhorn.defaultSettings.backupTarget }}
---
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: tenants-backup-daily
  namespace: longhorn-system
spec:
  task: backup
  cron: "0 3 * * *"
  retain: 7
  concurrency: 2
  groups:
    - default
{{- end }}
{{- end }}
|
||||
43
values.yaml
43
values.yaml
@@ -144,23 +144,36 @@ csiSnapshotter:
|
||||
longhorn:
|
||||
enabled: false
|
||||
# Replicas per Longhorn volume. Standard tier (single server) =
|
||||
# 1 — durability story is hourly S3 backup, not local replicas.
|
||||
# 1 — durability story is async S3 backup, not local replicas.
|
||||
# HA-Active sets this to 2 across the cluster's worker nodes.
|
||||
replicas: 1
|
||||
# Default data path. k3s nodes get `/var/lib/longhorn` by default;
|
||||
# production servers may want this on a separate disk for IOPS
|
||||
# isolation from the OS root volume.
|
||||
defaultDataPath: /var/lib/longhorn
|
||||
# S3 backup target for Longhorn's own block-level backups (DR
|
||||
# layer alongside Tower's application-level pg_dump path). When
|
||||
# set, Longhorn writes block-incremental backups to this prefix
|
||||
# daily. Empty = block-level backup disabled, application backup
|
||||
# only.
|
||||
backupTarget: ""
|
||||
# Same S3 secret Tower's application backup already uses.
|
||||
backupCredsSecret:
|
||||
namespace: tenants
|
||||
name: s3-backup-creds
|
||||
# Phase 5 of ADR 0003 — Longhorn's own settings, passed straight
|
||||
# through to the subchart's `defaultSettings`. The two-layer design:
|
||||
#
|
||||
# 1. Local CoW snapshots (Longhorn `task: snapshot`) — instant,
|
||||
# zero-blocking, taken hourly (last 24 kept). Used for fast undo.
|
||||
# 2. Async S3 backup (Longhorn `task: backup`) — block-incremental
|
||||
# upload to the tenant's bucket; gradual, never blocks the workflow.
|
||||
# Taken daily (last 7 kept). The DR layer alongside Tower's existing
|
||||
# application-level pg_dump backup (which is for cross-cluster
|
||||
# migration; Longhorn-S3 is for fast same-cluster restore).
|
||||
#
|
||||
# The RecurringJob CRDs that drive both layers live in
|
||||
# templates/longhorn-recurringjobs.yaml and bind to all volumes
|
||||
# via the `default` group automatically.
|
||||
defaultSettings:
|
||||
defaultDataPath: /var/lib/longhorn
|
||||
# backupTarget — set this per-server via the Argo App's helm
|
||||
# parameters to enable the async S3 backup channel. Format:
|
||||
# `s3://<bucket>@<region>/<prefix>/`. Empty = local snapshots
|
||||
# only (local layer still works; just no off-cluster copy).
|
||||
backupTarget: ""
|
||||
# backupTargetCredentialSecret — name of K8s Secret in the
|
||||
# `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID +
|
||||
# AWS_SECRET_ACCESS_KEY. Operator kubectl-applies it once per
|
||||
# cluster (same pattern as cloudflare-api-token). Cross-namespace
|
||||
# Secret references aren't allowed by Longhorn.
|
||||
backupTargetCredentialSecret: ""
|
||||
# Disable the Helm pre-upgrade checker Job. It's annotated as a
|
||||
# `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to
|
||||
# PreSync — but the Job's ServiceAccount lives in the regular sync
|
||||
|
||||
Reference in New Issue
Block a user