0.5.0: Longhorn local snapshots + async S3 backup (#347 phase 5)
This commit is contained in:
@@ -23,8 +23,8 @@ description: |
|
|||||||
Git).
|
Git).
|
||||||
|
|
||||||
type: application
|
type: application
|
||||||
version: 0.4.0
|
version: 0.5.0
|
||||||
appVersion: "0.4.0"
|
appVersion: "0.5.0"
|
||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- name: cert-manager
|
- name: cert-manager
|
||||||
|
|||||||
57
templates/longhorn-recurringjobs.yaml
Normal file
57
templates/longhorn-recurringjobs.yaml
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
{{- /*
|
||||||
|
Phase 5 of ADR 0003 — two-layer protection for tenant volumes.
|
||||||
|
|
||||||
|
Layer A — local CoW snapshots (`task: snapshot`)
|
||||||
|
Hourly. Instant. Zero blocking. Block-level CoW means the snapshot
|
||||||
|
shares blocks with live data; only diverged writes consume new
|
||||||
|
space. Cheap to keep ~24 hours of granular undo.
|
||||||
|
|
||||||
|
Layer B — async S3 backup (`task: backup`)
|
||||||
|
Daily. Block-incremental. The customer's workflow never waits on
|
||||||
|
the upload — Longhorn streams blocks to the configured S3 target
|
||||||
|
in the background. Renders the cluster's data durable off-cluster
|
||||||
|
even if Layer A snapshots are wiped (e.g. server reformat).
|
||||||
|
|
||||||
|
Both layers are independent of Tower's existing application-level
|
||||||
|
pg_dump backup. The application backup captures higher-level
|
||||||
|
semantic state (schema-aware, restorable to a different PG major
|
||||||
|
or cluster) and is what makes cross-cluster migration possible.
|
||||||
|
The Longhorn layers capture block-level state and are what makes
|
||||||
|
fast same-cluster restore possible. Both run; the customer keeps
|
||||||
|
both. Decision 0002 (Standard tier ships always-on durable backup)
|
||||||
|
is satisfied by the application layer alone; Longhorn-S3 is the
|
||||||
|
velocity-and-redundancy upgrade.
|
||||||
|
|
||||||
|
Both jobs target Longhorn's `default` group, which auto-includes
|
||||||
|
every volume with no explicit recurring-job reference. So the
|
||||||
|
schedule applies to existing AND future tenant PVCs without
|
||||||
|
operator action per-instance.
|
||||||
|
*/ -}}
|
||||||
|
{{- if .Values.longhorn.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: longhorn.io/v1beta2
|
||||||
|
kind: RecurringJob
|
||||||
|
metadata:
|
||||||
|
name: tenants-snapshot-hourly
|
||||||
|
namespace: longhorn-system
|
||||||
|
spec:
|
||||||
|
cron: "0 * * * *"
|
||||||
|
task: snapshot
|
||||||
|
groups: [default]
|
||||||
|
retain: 24
|
||||||
|
concurrency: 2
|
||||||
|
{{- if .Values.longhorn.defaultSettings.backupTarget }}
|
||||||
|
---
|
||||||
|
apiVersion: longhorn.io/v1beta2
|
||||||
|
kind: RecurringJob
|
||||||
|
metadata:
|
||||||
|
name: tenants-backup-daily
|
||||||
|
namespace: longhorn-system
|
||||||
|
spec:
|
||||||
|
cron: "0 3 * * *"
|
||||||
|
task: backup
|
||||||
|
groups: [default]
|
||||||
|
retain: 7
|
||||||
|
concurrency: 2
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
39
values.yaml
39
values.yaml
@@ -144,23 +144,36 @@ csiSnapshotter:
|
|||||||
longhorn:
|
longhorn:
|
||||||
enabled: false
|
enabled: false
|
||||||
# Replicas per Longhorn volume. Standard tier (single server) =
|
# Replicas per Longhorn volume. Standard tier (single server) =
|
||||||
# 1 — durability story is hourly S3 backup, not local replicas.
|
# 1 — durability story is async S3 backup, not local replicas.
|
||||||
# HA-Active sets this to 2 across the cluster's worker nodes.
|
# HA-Active sets this to 2 across the cluster's worker nodes.
|
||||||
replicas: 1
|
replicas: 1
|
||||||
# Default data path. k3s nodes get `/var/lib/longhorn` by default;
|
# Phase 5 of ADR 0003 — Longhorn's own settings, passed straight
|
||||||
# production servers may want this on a separate disk for IOPS
|
# through to the subchart's `defaultSettings`. The two-layer design:
|
||||||
# isolation from the OS root volume.
|
#
|
||||||
|
# 1. Local CoW snapshots (Longhorn `task: snapshot`) — instant,
|
||||||
|
# zero-blocking, taken hourly (last 24 kept). Used for fast undo.
|
||||||
|
# 2. Async S3 backup (Longhorn `task: backup`) — block-incremental
|
||||||
|
# upload to tenant's bucket, gradual, never blocks workflow.
|
||||||
|
# Runs daily (last 7 kept). The DR layer alongside Tower's existing
|
||||||
|
# application-level pg_dump backup (which is for cross-cluster
|
||||||
|
# migration; Longhorn-S3 is for fast same-cluster restore).
|
||||||
|
#
|
||||||
|
# The RecurringJob CRDs that drive both layers live in
|
||||||
|
# templates/longhorn-recurringjobs.yaml and bind to all volumes
|
||||||
|
# via the `default` group automatically.
|
||||||
|
defaultSettings:
|
||||||
defaultDataPath: /var/lib/longhorn
|
defaultDataPath: /var/lib/longhorn
|
||||||
# S3 backup target for Longhorn's own block-level backups (DR
|
# backupTarget — set this per-server via the Argo App's helm
|
||||||
# layer alongside Tower's application-level pg_dump path). When
|
# parameters to enable the async S3 backup channel. Format:
|
||||||
# set, Longhorn writes block-incremental backups to this prefix
|
# `s3://<bucket>@<region>/<prefix>/`. Empty = local snapshots
|
||||||
# daily. Empty = block-level backup disabled, application backup
|
# only (local layer still works; just no off-cluster block-level copy).
|
||||||
# only.
|
|
||||||
backupTarget: ""
|
backupTarget: ""
|
||||||
# Same S3 secret Tower's application backup already uses.
|
# backupTargetCredentialSecret — name of K8s Secret in the
|
||||||
backupCredsSecret:
|
# `longhorn-system` namespace carrying AWS_ACCESS_KEY_ID +
|
||||||
namespace: tenants
|
# AWS_SECRET_ACCESS_KEY. Operator kubectl-applies it once per
|
||||||
name: s3-backup-creds
|
# cluster (same pattern as cloudflare-api-token). Cross-namespace
|
||||||
|
# Secret references aren't allowed by Longhorn.
|
||||||
|
backupTargetCredentialSecret: ""
|
||||||
# Disable the Helm pre-upgrade checker Job. It's annotated as a
|
# Disable the Helm pre-upgrade checker Job. It's annotated as a
|
||||||
# `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to
|
# `helm.sh/hook: pre-upgrade,pre-install` which Argo translates to
|
||||||
# PreSync — but the Job's ServiceAccount lives in the regular sync
|
# PreSync — but the Job's ServiceAccount lives in the regular sync
|
||||||
|
|||||||
Reference in New Issue
Block a user