1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
| /prometheus/rule/host/nodata #无数据 {"status":true,"alert":"no data","expr":"up == 0","for":"5m","summary":"no data","description":"{{$labels.instance}} no data for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
/prometheus/rule/host/availcpult20 #cpu可用率小于20% {"status":true,"alert":"avail cpu lt 20%","expr":"avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by (type,instance,env,ip) < 0.2","for":"5m","summary":"avail cpu lt 20%","description":"avail cpu lt 20% for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
/prometheus/rule/host/availmemlt20 #mem可用率小于20% {"status":true,"alert":"avail mem lt 20%","expr":"1-(node_memory_MemTotal_bytes - node_memory_Cached_bytes - node_memory_Buffers_bytes - node_memory_MemFree_bytes) /node_memory_MemTotal_bytes < 0.2","for":"5m","summary":"avail mem lt 20%","description":"avail mem lt 20% for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
/prometheus/rule/host/availdisklt20 #disk可用率小于20% {"status":true,"alert":"avail disk lt 20%","expr":"node_filesystem_avail_bytes{fstype=~\"ext.*|xfs\",mountpoint!~\".*docker.*|.*pod.*|.*container|.*kubelet\"} /node_filesystem_size_bytes{fstype=~\"ext.*|xfs\",mountpoint!~\".*docker.*|.*pod.*|.*container|.*kubelet\"} < 0.2","for":"5m","summary":"avail disk lt 20%","description":"mount: {{ $labels.mountpoint }} avail lt 20% for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
/prometheus/rule/host/load1toohigh #1分钟负载 {"status":true,"alert":"load1 is too high","expr":"node_load1/2 > on(type,instance,env,ip) count(node_cpu_seconds_total{mode=\"system\"}) by (type,instance,env,ip)","for":"5m","summary":"load1 is too high","description":"load1 is too high for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
/prometheus/rule/host/useiopsgt80 #iops使用率大于80% {"status": true,"alert":"iops too high","expr":"rate(node_disk_io_time_seconds_total[5m]) > 0.8","for":"5m","summary":"iops too high","description":"iops too high for 5m, curr: {{ $value }}","labels":[{"key":"diyk","val":"diyv"}]}
(1 - ((node_memory_MemFree_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} + node_memory_Buffers_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} + node_memory_Cached_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"}) / (node_memory_MemTotal_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"}))) * 100
((node_memory_MemTotal_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} - node_memory_MemFree_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} - node_memory_Buffers_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} - node_memory_Cached_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"}) / (node_memory_MemTotal_bytes{origin_prometheus=~"$origin_prometheus",job=~"$job"} )) * 100
#告警规则整理 1分钟的负载大于cpu核心数 持续5m node_load1 > on(instance,ip) count(node_cpu_seconds_total{mode="system"}) by (instance,ip)
CPU可用率小于20% 持续5m avg(rate(node_cpu_seconds_total{mode="system"}[5m])) by (instance) *100 avg(rate(node_cpu_seconds_total{mode="user"}[5m])) by (instance) *100 avg(rate(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance) *100 avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) *100
磁盘可用率小于20%且可用小于20G 持续5m (node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint!~".*pod.*|.*docker-lib.*"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint!~".*pod.*|.*docker-lib.*"} < 0.2) and node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint!~".*pod.*|.*docker-lib.*"} < 20*1024^3
内存使用率大于80% 持续5m (node_memory_MemTotal_bytes - node_memory_Cached_bytes - node_memory_Buffers_bytes - node_memory_MemFree_bytes) /node_memory_MemTotal_bytes > 0.8
IOPS write大于300 read 大于2000 持续5m rate(node_disk_reads_completed_total[5m]) > 2000 or rate(node_disk_writes_completed_total[5m]) > 300
网卡 1小时总流量 5分钟速率 increase(node_network_receive_bytes_total[60m]) /1024/1024 increase(node_network_transmit_bytes_total[60m]) /1024/1024 rate(node_network_receive_bytes_total[5m])*8 rate(node_network_transmit_bytes_total[5m])*8
|