---
# Prometheus alerting rules.
# Each group is evaluated on its own `interval`; an alert only fires after its
# `expr` has been continuously true for the duration given in `for`.
groups:
  # Host-level resource alerts built on node_* metrics.
  - name: system.rules
    interval: 30s
    rules:
      # CPU busy % per instance = 100 - (average idle fraction over 5m * 100).
      - alert: HighCPUUsage
        expr: >-
          100 -
          (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage > 90% for 2 minutes."

      # Used memory % = (total - available) / total * 100.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage > 85% for 2 minutes."

      # Available space % per filesystem; tmpfs mounts are excluded by the
      # fstype matcher on both sides of the division.
      - alert: LowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes{fstype!="tmpfs"}
          / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 < 15
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "Disk space < 15% available for 5 minutes."

  # Container-level alerts.
  - name: docker.rules
    interval: 30s
    rules:
      # increase() gives the number of restarts across the 10m window, which is
      # what the description promises. rate() would be a per-second average, so
      # "> 3" would mean more than 3 restarts every second and never fire in
      # practice.
      # NOTE(review): assumes container_restart_count is a monotonically
      # increasing counter - confirm against the exporter.
      - alert: ContainerRestartingFrequently
        expr: increase(container_restart_count[10m]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting frequently: {{ $labels.container_label_com_docker_swarm_service_name }}"
          description: "Container restarted more than 3 times in 10 minutes."