feat(devops): CI/CD + Docker + Monitoring infrastructure
- Multi-stage Dockerfile (builder+runner, <500MB target)
- docker-compose.yml: app(x4) + postgres + redis + prometheus + grafana + nginx
- .env.example with all required secrets (never hardcoded)
- requirements.txt with all dependencies including prometheus-client, alembic
- GitHub Actions CI: lint (flake8+bandit+safety) + tests + Docker build/push
- GitHub Actions CD: staging deploy -> smoke tests -> production deploy + rollback
- Alembic migration setup + initial PostgreSQL schema (001_initial_schema)
- SQLite→PostgreSQL data migration script
- Prometheus metrics module (HTTP, ML, DB, business metrics)
- Prometheus alert rules (5xx >1%, latency >2s, disk >80%, ML accuracy)
- Grafana dashboard (overview: req/s, p95, ML accuracy, error rate)
- Nginx reverse proxy config (HTTPS/TLS, rate limiting, security headers)
- Structured JSON logging module
- Automated daily DB backup script (pg_dump + 30-day retention)

Branch: feature/devops-cicd
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
174
infra/grafana/dashboards/turf-saas-overview.json
Normal file
174
infra/grafana/dashboards/turf-saas-overview.json
Normal file
@@ -0,0 +1,174 @@
|
||||
{
|
||||
"title": "Turf SaaS — Overview",
|
||||
"uid": "turf-saas-overview",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"tags": ["turf-saas"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Request Rate (req/s)",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "sum(rate(http_requests_total[5m]))",
|
||||
"legendFormat": "req/s"
|
||||
}
|
||||
],
|
||||
"options": { "colorMode": "background", "graphMode": "area" }
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Error Rate (5xx)",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
|
||||
"legendFormat": "error %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "p95 Latency",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "ML Top-1 Accuracy",
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "ml_prediction_accuracy_ratio{accuracy_type=\"top1\"} * 100",
|
||||
"legendFormat": "top-1 %"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 25 },
|
||||
{ "color": "green", "value": 35 }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "HTTP Requests by Service",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "sum(rate(http_requests_total[5m])) by (service)",
|
||||
"legendFormat": "{{ service }}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "reqps" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Request Duration p50/p95/p99",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "ML Predictions per Hour",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "sum(increase(ml_predictions_total[1h])) by (model_type)",
|
||||
"legendFormat": "{{ model_type }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "timeseries",
|
||||
"title": "DB Query Duration",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||
"targets": [
|
||||
{
|
||||
"datasource": "Prometheus",
|
||||
"expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le, operation))",
|
||||
"legendFormat": "{{ operation }} p95"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s" }
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
11
infra/grafana/provisioning/dashboards/dashboards.yml
Normal file
11
infra/grafana/provisioning/dashboards/dashboards.yml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: turf-saas-dashboards
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
13
infra/grafana/provisioning/datasources/prometheus.yml
Normal file
13
infra/grafana/provisioning/datasources/prometheus.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus-main
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
timeInterval: "15s"
|
||||
157
infra/nginx/conf.d/turf.conf
Normal file
157
infra/nginx/conf.d/turf.conf
Normal file
@@ -0,0 +1,157 @@
|
||||
# ============================================================
|
||||
# Nginx Virtual Host — Turf SaaS
|
||||
# ============================================================
|
||||
|
||||
# Upstream service pools
|
||||
upstream combined_api {
|
||||
server combined-api:8790;
|
||||
keepalive 32;
|
||||
}
|
||||
|
||||
upstream dashboard_api {
|
||||
server dashboard-api:8791;
|
||||
keepalive 16;
|
||||
}
|
||||
|
||||
upstream portal {
|
||||
server portal:8792;
|
||||
keepalive 16;
|
||||
}
|
||||
|
||||
upstream grafana {
|
||||
server grafana:3000;
|
||||
keepalive 4;
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# HTTP → HTTPS redirect
|
||||
# ----------------------------------------------------------
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
# Let's Encrypt ACME challenge
|
||||
location /.well-known/acme-challenge/ {
|
||||
root /var/www/certbot;
|
||||
}
|
||||
|
||||
location / {
|
||||
return 301 https://$host$request_uri;
|
||||
}
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# HTTPS main server
|
||||
# ----------------------------------------------------------
|
||||
server {
|
||||
listen 443 ssl;
|
||||
http2 on;
|
||||
server_name ${DOMAIN};
|
||||
|
||||
# TLS configuration
|
||||
ssl_certificate /etc/letsencrypt/live/${DOMAIN}/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/${DOMAIN}/privkey.pem;
|
||||
ssl_session_cache shared:SSL:10m;
|
||||
ssl_session_timeout 10m;
|
||||
ssl_protocols TLSv1.2 TLSv1.3;
|
||||
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
|
||||
ssl_prefer_server_ciphers on;
|
||||
ssl_stapling on;
|
||||
ssl_stapling_verify on;
|
||||
|
||||
# Security headers
|
||||
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
|
||||
add_header X-Frame-Options DENY always;
|
||||
add_header X-Content-Type-Options nosniff always;
|
||||
add_header X-XSS-Protection "1; mode=block" always;
|
||||
add_header Referrer-Policy strict-origin-when-cross-origin always;
|
||||
add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:;" always;
|
||||
|
||||
# Limits
|
||||
client_max_body_size 10M;
|
||||
limit_conn conn_limit 20;
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Portal (root)
|
||||
# ----------------------------------------------------------
|
||||
location / {
|
||||
limit_req zone=global burst=50 nodelay;
|
||||
proxy_pass http://portal;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
proxy_read_timeout 60s;
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Combined API
|
||||
# ----------------------------------------------------------
|
||||
location /api/ {
|
||||
limit_req zone=api burst=20 nodelay;
|
||||
proxy_pass http://combined_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
proxy_read_timeout 120s;
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Dashboard API
|
||||
# ----------------------------------------------------------
|
||||
location /dashboard-api/ {
|
||||
limit_req zone=api burst=20 nodelay;
|
||||
proxy_pass http://dashboard_api/;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
proxy_read_timeout 120s;
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Grafana (restricted to internal/admin)
|
||||
# ----------------------------------------------------------
|
||||
location /grafana/ {
|
||||
# Restrict to admin IPs in production
|
||||
# allow 10.0.0.0/8;
|
||||
# deny all;
|
||||
|
||||
proxy_pass http://grafana;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Health check (no rate limiting)
|
||||
# ----------------------------------------------------------
|
||||
location /health {
    # Health probe endpoint: forwarded to the combined API's /health and
    # deliberately left without a limit_req so probes are never throttled.
    proxy_pass http://combined_api/health;
    proxy_http_version 1.1;

    # Same forwarding headers as the other proxied locations, so the backend
    # sees the public host and client address instead of the upstream name.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # The combined_api upstream declares a keepalive pool; it is only reused
    # when the Connection header is cleared (otherwise nginx sends "close").
    proxy_set_header Connection "";

    # Probes are frequent and uninteresting; keep them out of the access log.
    access_log off;
}
|
||||
|
||||
# Block common attack vectors
|
||||
location ~ /\. {
|
||||
deny all;
|
||||
access_log off;
|
||||
log_not_found off;
|
||||
}
|
||||
|
||||
location ~* \.(env|git|bak|sql|log)$ {
|
||||
deny all;
|
||||
access_log off;
|
||||
log_not_found off;
|
||||
}
|
||||
}
|
||||
65
infra/nginx/nginx.conf
Normal file
65
infra/nginx/nginx.conf
Normal file
@@ -0,0 +1,65 @@
|
||||
# ============================================================
|
||||
# Nginx — Main config
|
||||
# ============================================================
|
||||
|
||||
user nginx;
|
||||
worker_processes auto;
|
||||
error_log /var/log/nginx/error.log warn;
|
||||
pid /var/run/nginx.pid;
|
||||
|
||||
events {
|
||||
worker_connections 1024;
|
||||
use epoll;
|
||||
multi_accept on;
|
||||
}
|
||||
|
||||
http {
|
||||
include /etc/nginx/mime.types;
|
||||
default_type application/octet-stream;
|
||||
|
||||
# Logging
|
||||
log_format json_combined escape=json
|
||||
'{"time":"$time_iso8601",'
|
||||
'"remote_addr":"$remote_addr",'
|
||||
'"method":"$request_method",'
|
||||
'"uri":"$request_uri",'
|
||||
'"status":$status,'
|
||||
'"body_bytes":$body_bytes_sent,'
|
||||
'"duration":$request_time,'
|
||||
'"referrer":"$http_referer",'
|
||||
'"user_agent":"$http_user_agent",'
|
||||
'"x_forwarded_for":"$http_x_forwarded_for"}';
|
||||
|
||||
access_log /var/log/nginx/access.log json_combined;
|
||||
|
||||
# Performance
|
||||
sendfile on;
|
||||
tcp_nopush on;
|
||||
tcp_nodelay on;
|
||||
keepalive_timeout 65;
|
||||
types_hash_max_size 2048;
|
||||
server_tokens off;
|
||||
|
||||
# Gzip
|
||||
gzip on;
|
||||
gzip_vary on;
|
||||
gzip_min_length 1024;
|
||||
gzip_proxied any;
|
||||
gzip_comp_level 5;
|
||||
gzip_types
|
||||
text/plain
|
||||
text/css
|
||||
text/javascript
|
||||
application/javascript
|
||||
application/json
|
||||
application/xml
|
||||
image/svg+xml;
|
||||
|
||||
# Rate limiting zones
|
||||
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
|
||||
limit_req_zone $binary_remote_addr zone=global:20m rate=100r/m;
|
||||
limit_conn_zone $binary_remote_addr zone=conn_limit:10m;
|
||||
|
||||
# Include virtual hosts
|
||||
include /etc/nginx/conf.d/*.conf;
|
||||
}
|
||||
12
infra/postgres/init.sql
Normal file
12
infra/postgres/init.sql
Normal file
@@ -0,0 +1,12 @@
|
||||
-- ============================================================
|
||||
-- PostgreSQL init script for Turf SaaS
|
||||
-- Runs on first container start (docker-entrypoint-initdb.d)
|
||||
-- ============================================================
|
||||
|
||||
-- Create extensions
|
||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||
CREATE EXTENSION IF NOT EXISTS "pg_trgm";
|
||||
|
||||
-- Grant privileges to the app user
|
||||
GRANT ALL PRIVILEGES ON DATABASE turf_saas TO turf;
|
||||
GRANT ALL ON SCHEMA public TO turf;
|
||||
109
infra/prometheus/alerts.yml
Normal file
109
infra/prometheus/alerts.yml
Normal file
@@ -0,0 +1,109 @@
|
||||
# ============================================================
|
||||
# Prometheus Alert Rules — Turf SaaS
|
||||
# ============================================================
|
||||
|
||||
groups:
|
||||
# ----------------------------------------------------------
|
||||
# HTTP / API Alerts
|
||||
# ----------------------------------------------------------
|
||||
- name: http_alerts
|
||||
rules:
|
||||
- alert: HighErrorRate
|
||||
expr: |
|
||||
sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
|
||||
/
|
||||
sum(rate(http_requests_total[5m])) by (service)
|
||||
> 0.01
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "High 5xx error rate on {{ $labels.service }}"
|
||||
description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
|
||||
|
||||
- alert: HighLatency
|
||||
expr: |
|
||||
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
|
||||
> 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High p95 latency on {{ $labels.service }}"
|
||||
description: "p95 latency is {{ $value | humanizeDuration }} (threshold: 2s)"
|
||||
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service {{ $labels.job }} is down"
|
||||
description: "{{ $labels.instance }} has been unreachable for >1 minute"
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Database Alerts
|
||||
# ----------------------------------------------------------
|
||||
- name: database_alerts
|
||||
rules:
|
||||
- alert: PostgresDown
|
||||
expr: pg_up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL is down"
|
||||
description: "Cannot connect to PostgreSQL database"
|
||||
|
||||
- alert: PostgresDiskUsageHigh
|
||||
expr: |
|
||||
(pg_database_size_bytes / (1024 * 1024 * 1024)) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PostgreSQL database size > 10GB"
|
||||
description: "Database {{ $labels.datname }} is {{ $value | humanize }}GB"
|
||||
|
||||
- alert: DiskSpaceHigh
  # Used fraction of each filesystem, kept as a 0-1 ratio. It must NOT be
  # multiplied by 100 here: the humanizePercentage template function used in
  # the description expects a ratio and applies the x100 itself, so a
  # pre-scaled value would render as e.g. "8500%".
  expr: |
    (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes
    > 0.80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Disk usage > 80% on {{ $labels.instance }}"
    description: "{{ $labels.mountpoint }} is at {{ $value | humanizePercentage }}"
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# ML Prediction Alerts
|
||||
# ----------------------------------------------------------
|
||||
- name: ml_alerts
|
||||
rules:
|
||||
- alert: MLAccuracyDegraded
|
||||
expr: ml_prediction_accuracy_ratio{accuracy_type="top1"} < 0.30
|
||||
for: 60m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "ML top-1 accuracy below 30%"
|
||||
description: "Current accuracy: {{ $value | humanizePercentage }}"
|
||||
|
||||
- alert: MLPredictionDriftHigh
|
||||
expr: ml_prediction_drift_score > 0.5
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "ML feature drift detected"
|
||||
description: "Drift score for {{ $labels.feature_group }}: {{ $value }}"
|
||||
|
||||
- alert: NoPredictionsGenerated
|
||||
expr: increase(ml_predictions_total[1h]) == 0
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No ML predictions generated in the last 2 hours"
|
||||
description: "Check if the scheduler is running and PMU data is being scraped"
|
||||
68
infra/prometheus/prometheus.yml
Normal file
68
infra/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
# ============================================================
|
||||
# Prometheus Configuration — Turf SaaS
|
||||
# ============================================================
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
project: turf-saas
|
||||
env: production
|
||||
|
||||
# Alertmanager — wire up when available
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: []
|
||||
|
||||
# Load alert rules
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
|
||||
# ============================================================
|
||||
# Scrape targets
|
||||
# ============================================================
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
- targets: [localhost:9090]
|
||||
|
||||
# Combined API
|
||||
- job_name: combined-api
|
||||
static_configs:
|
||||
- targets: [combined-api:8790]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# Dashboard API
|
||||
- job_name: dashboard-api
|
||||
static_configs:
|
||||
- targets: [dashboard-api:8791]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 15s
|
||||
|
||||
# Portal
|
||||
- job_name: portal
|
||||
static_configs:
|
||||
- targets: [portal:8792]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
|
||||
# PostgreSQL exporter (if deployed)
|
||||
- job_name: postgres
|
||||
static_configs:
|
||||
- targets: [postgres-exporter:9187]
|
||||
scrape_interval: 30s
|
||||
|
||||
# Redis exporter (if deployed)
|
||||
- job_name: redis
|
||||
static_configs:
|
||||
- targets: [redis-exporter:9121]
|
||||
scrape_interval: 30s
|
||||
|
||||
# Node exporter (host metrics)
|
||||
- job_name: node
|
||||
static_configs:
|
||||
- targets: [host.docker.internal:9100]
|
||||
scrape_interval: 30s
|
||||
45
infra/scripts/backup_db.sh
Executable file
45
infra/scripts/backup_db.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# ============================================================
# Automated PostgreSQL Backup Script
# Run daily via cron: 0 2 * * * /opt/turf-saas/infra/scripts/backup_db.sh
#
# Dumps the database with pg_dump, gzips it into BACKUP_DIR, prunes dumps
# older than KEEP_DAYS and optionally sends a Telegram notification.
#
# Environment:
#   POSTGRES_PASSWORD   required (may be empty); exported to pg_dump as PGPASSWORD
#   POSTGRES_DB         database name       (default: turf_saas)
#   POSTGRES_USER       database user       (default: turf)
#   POSTGRES_HOST       database host       (default: postgres)
#   BACKUP_DIR          destination folder  (default: /opt/backups/turf-saas)
#   KEEP_DAYS           retention in days   (default: 30)
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID
#                       optional; when both are set a success message is sent
#
# Exit status: 0 on success, non-zero if validation or the dump fails.
# ============================================================

set -euo pipefail

BACKUP_DIR="${BACKUP_DIR:-/opt/backups/turf-saas}"
KEEP_DAYS="${KEEP_DAYS:-30}"
DB_NAME="${POSTGRES_DB:-turf_saas}"
DB_USER="${POSTGRES_USER:-turf}"
DB_HOST="${POSTGRES_HOST:-postgres}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="${BACKUP_DIR}/turf_saas_${TIMESTAMP}.sql.gz"
# The dump is written here first; the suffix keeps it outside the
# "turf_saas_*.sql.gz" pattern so an incomplete dump is never taken for a backup.
PARTIAL_FILE="${BACKUP_FILE}.partial"

# Fail up front with a clear message instead of an "unbound variable" error
# in the middle of the run. An explicitly empty value is still accepted.
: "${POSTGRES_PASSWORD?POSTGRES_PASSWORD must be set}"

# Validate the retention before doing any work: a malformed value would
# otherwise only surface when find rejects it after the dump has been taken.
case "${KEEP_DAYS}" in
  '' | *[!0-9]*)
    echo "[$(date -Iseconds)] KEEP_DAYS must be a non-negative integer, got '${KEEP_DAYS}'" >&2
    exit 1
    ;;
esac

# Remove a half-written dump on every exit path (no-op after a successful mv).
cleanup() {
  rm -f -- "${PARTIAL_FILE}"
}
trap cleanup EXIT

echo "[$(date -Iseconds)] Starting backup: ${BACKUP_FILE}"

# Ensure backup directory exists
mkdir -p "${BACKUP_DIR}"

# Perform backup. With pipefail, a pg_dump failure aborts the script here and
# the EXIT trap discards the partial file.
PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \
  -h "${DB_HOST}" \
  -U "${DB_USER}" \
  -d "${DB_NAME}" \
  --no-owner \
  --no-acl \
  | gzip > "${PARTIAL_FILE}"

# Publish under the final name only once the whole pipeline has succeeded.
mv -- "${PARTIAL_FILE}" "${BACKUP_FILE}"

SIZE=$(du -sh "${BACKUP_FILE}" | cut -f1)
echo "[$(date -Iseconds)] Backup complete: ${BACKUP_FILE} (${SIZE})"

# Remove backups older than KEEP_DAYS (regular files only)
find "${BACKUP_DIR}" -type f -name "turf_saas_*.sql.gz" -mtime "+${KEEP_DAYS}" -delete
echo "[$(date -Iseconds)] Old backups cleaned (kept last ${KEEP_DAYS} days)"

# Optional: notify on completion
if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "${TELEGRAM_CHAT_ID:-}" ]]; then
  # Best-effort: a notification failure must not fail a successful backup,
  # hence "|| true". --max-time keeps a hung network from stalling the cron
  # job; --data-urlencode encodes the spaces/parentheses/emoji in the text.
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=✅ DB Backup OK: turf_saas ${TIMESTAMP} (${SIZE})" \
    > /dev/null || true
fi
|
||||
Reference in New Issue
Block a user