Skip to main content

Operations Guide

This guide covers operational procedures for managing the Sparbz Cloud platform.

Deployment

Initial Deployment

# Add Helm repository
helm repo add sparbz https://charts.sparbz.cloud
helm repo update

# Deploy to Kubernetes
helm install szc sparbz/szc \
--namespace szc \
--create-namespace \
-f values-production.yaml

Verifying Deployment

# Check pod status
kubectl get pods -n szc

# Expected output:
# NAME READY STATUS RESTARTS AGE
# szc-api-xxxxx 1/1 Running 0 2m
# szc-api-xxxxx 1/1 Running 0 2m
# szc-event-bridge-xxxxx 1/1 Running 0 2m
# szc-event-bridge-xxxxx 1/1 Running 0 2m
# szc-status-worker-xxxxx 1/1 Running 0 2m
# szc-usage-collector-xxxxx 1/1 Running 0 2m

# Check CronJobs are scheduled
kubectl get cronjobs -n szc

# Expected:
# NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
# szc-usage-aggregator 0 * * * * False 0 <none> 2m
# szc-meter-sync 0 2 * * * False 0 <none> 2m
# szc-garbage-collector 0 3 * * * False 0 <none> 2m

Scaling

Scale API Server

# Scale to 3 replicas
kubectl scale deployment szc-api -n szc --replicas=3

# Verify
kubectl get deployment szc-api -n szc

Scale Event Bridge

# For high WebSocket load
kubectl scale deployment szc-event-bridge -n szc --replicas=4

View Current Load

# Check resource usage
kubectl top pods -n szc

# Expected output:
# NAME CPU(cores) MEMORY(bytes)
# szc-api-xxxxx 45m 120Mi
# szc-event-bridge-xxxxx 30m 85Mi

Monitoring

Check Service Health

# API health check
curl https://api.sparbz.cloud/health

# Expected response:
# {"status":"healthy","uptime":"24h15m32s"}

View Logs

# View API logs
kubectl logs -n szc deployment/szc-api --tail=100 -f

# View Event Bridge logs
kubectl logs -n szc deployment/szc-event-bridge --tail=100 -f

# View Status Worker logs
kubectl logs -n szc deployment/szc-status-worker --tail=100 -f

# View Usage Collector logs
kubectl logs -n szc deployment/szc-usage-collector --tail=100 -f

# View CronJob logs (last run)
kubectl logs -n szc job/szc-usage-aggregator-xxxxx
kubectl logs -n szc job/szc-meter-sync-xxxxx
kubectl logs -n szc job/szc-garbage-collector-xxxxx

Monitor Metrics

# View Prometheus metrics
curl https://api.sparbz.cloud/metrics

# Check specific metrics:
# - http_requests_total
# - http_request_duration_seconds
# - database_connections_active

Usage & Billing

Monitor Usage

Check the API to view current usage:

# Get current period usage
curl -H "Authorization: Bearer $TOKEN" \
https://api.sparbz.cloud/api/v1/usage

# Get usage by resource
curl -H "Authorization: Bearer $TOKEN" \
https://api.sparbz.cloud/api/v1/usage/databases

# Get cost breakdown
curl -H "Authorization: Bearer $TOKEN" \
"https://api.sparbz.cloud/api/v1/usage/costs?period=current"

Verify Billing

Check the aggregation and sync processes:

# Check if aggregator is running
kubectl get cronjob szc-usage-aggregator -n szc

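# View last aggregation job
# (replace xxxxx with the suffix of the most recent job listed by the first command;
# kubectl does not expand wildcards in resource names)
kubectl get jobs -n szc | grep usage-aggregator
kubectl logs -n szc job/szc-usage-aggregator-xxxxx

# Check the last meter-sync run (replace xxxxx as above)
kubectl get jobs -n szc | grep meter-sync
kubectl logs -n szc job/szc-meter-sync-xxxxx --tail=50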

Manual Aggregation (if needed)

# Trigger aggregator job manually
JOB_NAME="manual-aggregator-$(date +%s)"
kubectl create job --from=cronjob/szc-usage-aggregator \
"$JOB_NAME" -n szc

# View job progress
kubectl logs -n szc -f "job/$JOB_NAME"

Manual Meter Sync (if needed)

# Trigger meter-sync manually
JOB_NAME="manual-sync-$(date +%s)"
kubectl create job --from=cronjob/szc-meter-sync \
"$JOB_NAME" -n szc

# View job progress
kubectl logs -n szc -f "job/$JOB_NAME"

Resource Cleanup

Enable Garbage Collection

By default, the garbage collector runs in dry-run mode. To enable actual deletion:

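# Update Helm values
# (--reuse-values keeps the release's existing values; with --set alone,
# every other value is reset to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set garbageCollector.dryRun=false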

Monitor Cleanup Operations

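# Check garbage collector status
# (replace xxxxx with the suffix of the most recent job listed by the first command)
kubectl get jobs -n szc | grep garbage-collector
kubectl logs -n szc job/szc-garbage-collector-xxxxx --tail=50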

# Example output:
# Starting garbage collection (dryRun=false, retention=30 days)
# Found 5 orphaned PVCs
# Cleaned 5 orphaned PVCs
# Purged 12 soft-deleted database records
# Cleaned 8 old backup records
# Garbage collection completed: 25 total items cleaned

Adjust Retention Period

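# Keep soft-deleted records for 60 days before purging
# (--reuse-values keeps the release's existing values; with --set alone,
# every other value is reset to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set garbageCollector.retentionDays=60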

Database Management

Database Backups

# View backup policies
kubectl get backupstoragelocations -n szc

# List recorded backups (read-only query; this does not trigger a new backup)
kubectl exec -it deployment/szc-api -n szc -- \
sqlite3 /data/backup.db "SELECT * FROM backups;"

Database Migrations

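# Check migration status
# (replace xxxxx with the suffix of the migration job listed by the first command)
kubectl get jobs -n szc | grep szc-migrate
kubectl logs -n szc job/szc-migrate-xxxxx --tail=20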

# Run migrations manually (if needed)
kubectl exec -it deployment/szc-api -n szc -- \
atlas migrate apply

Database Cleanup

# Connect to database
kubectl port-forward -n szc svc/postgres 5432:5432 &

# Count soft-deleted records (example)
psql "$DATABASE_URL" -c "
SELECT COUNT(*) FROM databases WHERE deleted_at IS NOT NULL;
"

# Kill port-forward
kill %1

Troubleshooting

API Not Responding

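# Check pod status
kubectl get pods -n szc
kubectl describe deployment/szc-api -n szc

# View logs from the previous container instance (useful after a crash or restart;
# drop --previous to see the logs of the currently running container)
kubectl logs -n szc deployment/szc-api --tail=50 --previous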

# Check service connectivity
kubectl exec -it -n szc deployment/szc-api -- \
curl -v http://localhost:8080/health

# Restart deployment
kubectl rollout restart deployment/szc-api -n szc

CronJob Failures

# Check CronJob status
kubectl describe cronjob szc-usage-aggregator -n szc

# View failed job
kubectl get jobs -n szc | grep usage-aggregator
kubectl describe job/szc-usage-aggregator-xxxxx -n szc

# Check job logs
kubectl logs -n szc job/szc-usage-aggregator-xxxxx

# Delete failed job (allows new run)
kubectl delete job szc-usage-aggregator-xxxxx -n szc

Database Connection Issues

# Verify secret
kubectl get secret -n szc szc-db -o jsonpath='{.data.url}' | base64 -d

# Test connection from pod
kubectl run -it --rm debug --image=postgres:15 --restart=Never -n szc -- \
psql "$(kubectl get secret -n szc szc-db -o jsonpath='{.data.url}' | base64 -d)"

Kafka/Event Issues

# Check Kafka connectivity
kubectl get kafka -n kafka

# Check consumer groups
kubectl exec -it -n kafka kafka-controller-0 -- \
kafka-consumer-groups.sh --bootstrap-server localhost:9092 --list

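# Reset consumer group offsets (if needed)
# The group's consumers must be stopped first. As written, this only prints a
# dry-run preview of the new offsets; append --execute to apply the reset.
kubectl exec -it -n kafka kafka-controller-0 -- \
kafka-consumer-groups.sh --bootstrap-server localhost:9092 \
--group szc-api --all-topics --reset-offsets --to-earliest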

Updates & Patches

Update Service

# Update Helm chart
helm repo update
helm search repo sparbz

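# Upgrade deployment
# (--reuse-values keeps the release's existing values; with --set alone,
# every other value is reset to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set api.image.tag=v1.2.31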

# Verify rollout
kubectl rollout status deployment/szc-api -n szc

Rollback on Issues

# View revision history
helm history szc -n szc

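# Rollback to previous version
# (omitting the revision rolls back to the previous release; to target a specific
# revision from the history above, pass its number, e.g. `helm rollback szc 3 -n szc`)
helm rollback szc -n szc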

# Verify
kubectl rollout status deployment/szc-api -n szc

Maintenance Windows

Schedule Maintenance

# Suspend CronJobs during maintenance
kubectl patch cronjob szc-usage-aggregator -n szc -p '{"spec":{"suspend":true}}'
kubectl patch cronjob szc-meter-sync -n szc -p '{"spec":{"suspend":true}}'
kubectl patch cronjob szc-garbage-collector -n szc -p '{"spec":{"suspend":true}}'

# Resume after maintenance
kubectl patch cronjob szc-usage-aggregator -n szc -p '{"spec":{"suspend":false}}'
kubectl patch cronjob szc-meter-sync -n szc -p '{"spec":{"suspend":false}}'
kubectl patch cronjob szc-garbage-collector -n szc -p '{"spec":{"suspend":false}}'

Performance Tuning

Adjust Resource Limits

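# Increase API memory
# (--reuse-values keeps the release's existing values; with --set alone,
# every other value is reset to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set api.resources.limits.memory=1Gi

# Increase CPU for aggregator
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set usageAggregator.resources.limits.cpu=1000m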

Connection Pooling

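# Adjust database connection pool
# (--reuse-values keeps the release's existing values; with --set alone,
# every other value is reset to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set api.env.DB_POOL_SIZE=30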

Disaster Recovery

Database Recovery

# List available backups
kubectl get backup -n szc

# Restore from backup
kubectl patch backup my-backup -n szc -p '{"status":{"phase":"restoring"}}'

Event Replay

# Replay usage events from Kafka
kubectl create job --from=cronjob/szc-usage-aggregator \
replay-job -n szc

# Watch progress
kubectl logs -n szc -f job/replay-job

Checklists

Daily Operations

  • Check pod status (kubectl get pods -n szc)
  • Verify CronJob execution
  • Check API health (curl /health)
  • Monitor error rates in metrics

Weekly

  • Review logs for warnings/errors
  • Check database connection count
  • Verify backups are running
  • Review Stripe billing sync status

Monthly

  • Review resource utilization
  • Check for unused resources
  • Verify disaster recovery procedures
  • Update dependencies

See Architecture for system design details.