一个高可用的监控报警系统通常包含以下组件:
1. 数据采集层:Prometheus、Telegraf 等
2. 存储层:Prometheus TSDB、InfluxDB、VictoriaMetrics 等
3. 报警管理层:Alertmanager
4. 可视化层:Grafana
5. 高可用保障:负载均衡、集群部署、数据复制
# prometheus.yml -- primary Prometheus instance.
# Scrapes itself and node1-3, evaluates the alerting rules, and federates
# the `node` job from the secondary instance.
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often rules are evaluated

rule_files:
  - 'alert.rules'

# Without an `alerting` section the rules above fire but are never
# delivered. List every Alertmanager replica so each one receives every
# alert; the cluster de-duplicates notifications.
# NOTE(review): assumes the Alertmanager HTTP API listens on the default
# port 9093 on node1-3 -- adjust hosts/ports to the real deployment.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['node1:9093', 'node2:9093', 'node3:9093']

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    static_configs:
      - targets: ['node1:9100', 'node2:9100', 'node3:9100']

  # Federation: pull the series selected by match[] from the secondary.
  - job_name: 'secondary-prometheus'
    # Keep the labels exposed by the secondary instead of overwriting
    # them with this scrape job's own labels.
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job="node"}'
    static_configs:
      - targets: ['prometheus-secondary:9090']
# prometheus.yml -- secondary Prometheus instance.
# Scrapes itself and node4-6. It loads no rule files.
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often rules are evaluated

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    static_configs:
      - targets: ['node4:9100', 'node5:9100', 'node6:9100']
# alertmanager.yml -- routing, e-mail delivery and inhibition.
global:
  resolve_timeout: 5m

# Single top-level route: every alert goes to the e-mail receiver.
route:
  group_by: ['alertname', 'cluster']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: 'team-email'

receivers:
  - name: 'team-email'
    email_configs:
      - to: 'team@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager'
        # NOTE(review): placeholder credential stored inline; replace it
        # and keep the real secret out of version control.
        auth_password: 'password'
        send_resolved: true

# Mute `warning` alerts while a `critical` alert with the same
# alertname and cluster labels is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster']
# Node 1: start Alertmanager and join the cluster by listing every peer.
alertmanager \
  --config.file=alertmanager.yml \
  --cluster.peer=node1:9094 \
  --cluster.peer=node2:9094 \
  --cluster.peer=node3:9094
# Nodes 2 and 3 are started the same way; make sure every node knows
# about all the others.
# grafana.ini -- state shared by all Grafana instances so that any
# instance behind the load balancer can serve any request.
[database]
type = mysql
host = mysql-ha:3306
name = grafana
user = grafana
password = securepassword
# For MySQL the documented values are true, false or skip-verify;
# "disable" is the PostgreSQL spelling.
ssl_mode = false

# NOTE(review): [session] is only read by older Grafana releases --
# confirm it still applies to the version being deployed.
[session]
provider = mysql
# DSN uses the same account as [database] above.
provider_config = grafana:securepassword@tcp(mysql-ha:3306)/grafana
在Nginx或HAProxy后部署多个Grafana实例:
# Maps the client's Upgrade header to the Connection header value that
# is sent upstream for WebSocket requests (used by /api/live/ below).
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

# Pool of Grafana instances; nginx balances requests across them.
upstream grafana {
    server grafana1:3000;
    server grafana2:3000;
    server grafana3:3000;
}

server {
    listen 80;
    server_name grafana.example.com;

    location / {
        # Forward the original Host header; without it Grafana sees
        # "grafana" (the upstream name) as the request host.
        proxy_set_header Host $host;
        proxy_pass http://grafana;
    }

    # Grafana Live uses WebSocket connections, which need HTTP/1.1 and
    # the Upgrade/Connection headers passed through.
    location /api/live/ {
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;
        proxy_set_header Host $host;
        proxy_pass http://grafana;
    }
}
# alert.rules -- alerting rules loaded through `rule_files`.
groups:
  - name: node-alerts
    rules:
      # Fires when the 5-minute load average stays above 1.5 for 10 minutes.
      - alert: HighNodeLoad
        expr: node_load5 > 1.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has high load: {{ $value }}"

      # Fires when a scrape target has been unreachable for 5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
故障转移测试:
负载测试:使用 prometheus-benchmark 工具模拟高负载
数据一致性检查:
定期备份:
监控监控系统:
容量规划:
日志记录:
通过以上配置,您可以建立一个高可用的Linux监控报警系统,能够承受单点故障并持续提供服务。