Operations Guide
This guide covers operational procedures for managing the Sparbz Cloud platform.
Deployment
Initial Deployment
# Add Helm repository
helm repo add sparbz https://charts.sparbz.cloud
helm repo update
# Deploy to Kubernetes
helm install szc sparbz/szc \
--namespace szc \
--create-namespace \
-f values-production.yaml
Verifying Deployment
# Check pod status
kubectl get pods -n szc
# Expected output:
# NAME READY STATUS RESTARTS AGE
# szc-api-xxxxx 1/1 Running 0 2m
# szc-api-xxxxx 1/1 Running 0 2m
# szc-event-bridge-xxxxx 1/1 Running 0 2m
# szc-event-bridge-xxxxx 1/1 Running 0 2m
# szc-status-worker-xxxxx 1/1 Running 0 2m
# szc-usage-collector-xxxxx 1/1 Running 0 2m
# Check CronJobs are scheduled
kubectl get cronjobs -n szc
# Expected:
# NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
# szc-usage-aggregator 0 * * * * False 0 <none> 2m
# szc-meter-sync 0 2 * * * False 0 <none> 2m
# szc-garbage-collector 0 3 * * * False 0 <none> 2m
Scaling
Scale API Server
# Scale to 3 replicas
kubectl scale deployment szc-api -n szc --replicas=3
# Verify
kubectl get deployment szc-api -n szc
Scale Event Bridge
# For high WebSocket load
kubectl scale deployment szc-event-bridge -n szc --replicas=4
View Current Load
# Check resource usage
kubectl top pods -n szc
# Expected output:
# NAME CPU(cores) MEMORY(Mi)
# szc-api-xxxxx 45m 120Mi
# szc-event-bridge-xxxxx 30m 85Mi
Monitoring
Check Service Health
# API health check
curl https://api.sparbz.cloud/health
# Expected response:
# {"status":"healthy","uptime":"24h15m32s"}
View Logs
# View API logs
kubectl logs -n szc deployment/szc-api --tail=100 -f
# View Event Bridge logs
kubectl logs -n szc deployment/szc-event-bridge --tail=100 -f
# View Status Worker logs
kubectl logs -n szc deployment/szc-status-worker --tail=100 -f
# View Usage Collector logs
kubectl logs -n szc deployment/szc-usage-collector --tail=100 -f
# View CronJob logs (last run)
kubectl logs -n szc job/szc-usage-aggregator-xxxxx
kubectl logs -n szc job/szc-meter-sync-xxxxx
kubectl logs -n szc job/szc-garbage-collector-xxxxx
Monitor Metrics
# View Prometheus metrics
curl https://api.sparbz.cloud/metrics
# Check specific metrics:
# - http_requests_total
# - http_request_duration_seconds
# - database_connections_active
Usage & Billing
Monitor Usage
Check the API to view current usage:
# Get current period usage
curl -H "Authorization: Bearer $TOKEN" \
https://api.sparbz.cloud/api/v1/usage
# Get usage by resource
curl -H "Authorization: Bearer $TOKEN" \
https://api.sparbz.cloud/api/v1/usage/databases
# Get cost breakdown (URL is quoted so the shell does not treat "?" as a glob)
curl -H "Authorization: Bearer $TOKEN" \
"https://api.sparbz.cloud/api/v1/usage/costs?period=current"
Verify Billing
Check the aggregation and sync processes:
# Check if aggregator is running
kubectl get cronjob szc-usage-aggregator -n szc
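# View last aggregation job
# (Jobs created by a CronJob get a numeric timestamp suffix, and kubectl does not
# expand wildcards in resource names, so look up the newest Job by creation time.)
kubectl get jobs -n szc | grep usage-aggregator
kubectl logs -n szc "$(kubectl get jobs -n szc --sort-by=.metadata.creationTimestamp -o name | grep szc-usage-aggregator | tail -n 1)"
# Check the most recent meter-sync run
kubectl logs -n szc "$(kubectl get jobs -n szc --sort-by=.metadata.creationTimestamp -o name | grep szc-meter-sync | tail -n 1)" --tail=50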
Manual Aggregation (if needed)
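# Trigger aggregator job manually (keep the generated name so it can be reused below)
JOB_NAME="manual-aggregator-$(date +%s)"
kubectl create job --from=cronjob/szc-usage-aggregator \
"$JOB_NAME" -n szc
# View job progress (kubectl does not expand wildcards in resource names)
kubectl logs -n szc -f "job/$JOB_NAME"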
Manual Meter Sync (if needed)
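# Trigger meter-sync manually (keep the generated name so it can be reused below)
JOB_NAME="manual-sync-$(date +%s)"
kubectl create job --from=cronjob/szc-meter-sync \
"$JOB_NAME" -n szc
# View job progress (kubectl does not expand wildcards in resource names)
kubectl logs -n szc -f "job/$JOB_NAME"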
Resource Cleanup
Enable Garbage Collection
By default, garbage collector runs in dry-run mode. To enable actual deletion:
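# Update Helm values
# (--reuse-values keeps the values from the existing release; without it, passing
# --set resets everything else, including values-production.yaml, to chart defaults)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set garbageCollector.dryRun=false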
Monitor Cleanup Operations
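# Check garbage collector status (logs of the most recent garbage-collector Job;
# kubectl does not expand wildcards in resource names)
kubectl logs -n szc "$(kubectl get jobs -n szc --sort-by=.metadata.creationTimestamp -o name | grep szc-garbage-collector | tail -n 1)" --tail=50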
# Example output:
# Starting garbage collection (dryRun=false, retention=30 days)
# Found 5 orphaned PVCs
# Cleaned 5 orphaned PVCs
# Purged 12 soft-deleted database records
# Cleaned 8 old backup records
# Garbage collection completed: 25 total items cleaned
Adjust Retention Period
# Keep soft-deleted records for 60 days before purging
# (--reuse-values preserves previously applied settings such as dryRun=false)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set garbageCollector.retentionDays=60
Database Management
Database Backups
# View backup policies
kubectl get backupstoragelocations -n szc
# List recorded backups (this query only reads the backup table; it does not start a backup)
kubectl exec -it deployment/szc-api -n szc -- \
sqlite3 /data/backup.db "SELECT * FROM backups;"
Database Migrations
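# Check migration status (logs of the most recent migrate Job;
# kubectl does not expand wildcards in resource names)
kubectl logs -n szc "$(kubectl get jobs -n szc --sort-by=.metadata.creationTimestamp -o name | grep szc-migrate | tail -n 1)" --tail=20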
# Run migrations manually (if needed)
kubectl exec -it deployment/szc-api -n szc -- \
atlas migrate apply
Database Cleanup
# Connect to database
kubectl port-forward -n szc svc/postgres 5432:5432 &
PF_PID=$!
# Give the port-forward a moment to start listening
sleep 2
# Count soft-deleted records (example)
psql "$DATABASE_URL" -c "
SELECT COUNT(*) FROM databases WHERE deleted_at IS NOT NULL;
"
# Kill port-forward
kill "$PF_PID"
Troubleshooting
API Not Responding
# Check deployment and pod status
kubectl describe deployment szc-api -n szc
kubectl get pods -n szc
# View logs from the previous container instance (useful after a crash or restart)
kubectl logs -n szc deployment/szc-api --tail=50 --previous
# Check service connectivity
kubectl exec -it -n szc deployment/szc-api -- \
curl -v http://localhost:8080/health
# Restart deployment
kubectl rollout restart deployment/szc-api -n szc
CronJob Failures
# Check CronJob status
kubectl describe cronjob szc-usage-aggregator -n szc
# View failed job
kubectl get jobs -n szc | grep usage-aggregator
kubectl describe job/szc-usage-aggregator-xxxxx -n szc
# Check job logs
kubectl logs -n szc job/szc-usage-aggregator-xxxxx
# Delete failed job (allows new run)
kubectl delete job szc-usage-aggregator-xxxxx -n szc
Database Connection Issues
# Verify secret
kubectl get secret -n szc szc-db -o jsonpath='{.data.url}' | base64 -d
# Test connection from pod
kubectl run -it --rm debug --image=postgres:15 --restart=Never -n szc -- \
psql "$(kubectl get secret -n szc szc-db -o jsonpath='{.data.url}' | base64 -d)"
Kafka/Event Issues
# Check Kafka connectivity
kubectl get kafka -n kafka
# Check consumer groups
kubectl exec -it -n kafka kafka-controller-0 -- \
kafka-consumer-groups.sh --bootstrap-server localhost:9092 --list
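# Reset consumer group (if needed)
# The group must have no active consumers. --reset-offsets requires a topic scope
# (--all-topics or --topic) and only prints a dry-run plan unless --execute is given.
kubectl exec -it -n kafka kafka-controller-0 -- \
kafka-consumer-groups.sh --bootstrap-server localhost:9092 \
--group szc-api --reset-offsets --to-earliest --all-topics --execute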
Updates & Patches
Update Service
# Update Helm chart
helm repo update
helm search repo sparbz
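# Upgrade deployment
# (pass the production values file again; otherwise --set alone resets all other
# values to the chart defaults)
helm upgrade szc sparbz/szc -n szc \
-f values-production.yaml \
--set api.image.tag=v1.2.31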
# Verify rollout
kubectl rollout status deployment/szc-api -n szc
Rollback on Issues
# View revision history
helm history szc -n szc
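# Rollback to previous version
# (omitting the revision rolls back to the previous release; "1" would be the
# very first install. Pick a specific revision from `helm history` if needed.)
helm rollback szc -n szc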
# Verify
kubectl rollout status deployment/szc-api -n szc
Maintenance Windows
Schedule Maintenance
# Suspend CronJobs during maintenance
kubectl patch cronjob szc-usage-aggregator -n szc -p '{"spec":{"suspend":true}}'
kubectl patch cronjob szc-meter-sync -n szc -p '{"spec":{"suspend":true}}'
kubectl patch cronjob szc-garbage-collector -n szc -p '{"spec":{"suspend":true}}'
# Resume after maintenance
kubectl patch cronjob szc-usage-aggregator -n szc -p '{"spec":{"suspend":false}}'
kubectl patch cronjob szc-meter-sync -n szc -p '{"spec":{"suspend":false}}'
kubectl patch cronjob szc-garbage-collector -n szc -p '{"spec":{"suspend":false}}'
Performance Tuning
Adjust Resource Limits
# Increase API memory
# (--reuse-values keeps all other previously applied values intact)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set api.resources.limits.memory=1Gi
# Increase CPU for aggregator
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set usageAggregator.resources.limits.cpu=1000m
Connection Pooling
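# Adjust database connection pool
# (--set-string keeps the value a string, as environment variables require;
# --reuse-values keeps all other previously applied values intact)
helm upgrade szc sparbz/szc -n szc \
--reuse-values \
--set-string api.env.DB_POOL_SIZE=30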
Disaster Recovery
Database Recovery
# List available backups
kubectl get backup -n szc
# Restore from backup
kubectl patch backup my-backup -n szc -p '{"status":{"phase":"restoring"}}'
Event Replay
# Replay usage events from Kafka
kubectl create job --from=cronjob/szc-usage-aggregator \
replay-job -n szc
# Watch progress
kubectl logs -n szc -f job/replay-job
Checklists
Daily Operations
- Check pod status (kubectl get pods -n szc)
- Verify CronJob execution
- Check API health (curl https://api.sparbz.cloud/health)
- Monitor error rates in metrics
Weekly
- Review logs for warnings/errors
- Check database connection count
- Verify backups are running
- Review Stripe billing sync status
Monthly
- Review resource utilization
- Check for unused resources
- Verify disaster recovery procedures
- Update dependencies
See Architecture for system design details.