要实现Linux环境下高可用的网络存储监控，我建议采用以下架构：
- 存储层：使用DRBD+Heartbeat实现存储高可用
- 监控层：使用Prometheus+Grafana+Alertmanager实现监控告警
- 可选：结合Zabbix进行更全面的系统监控
# --- DRBD: replicated block device -----------------------------------------
# Run this section on BOTH storage nodes; the guarded part at the end only
# takes effect on the node that should become the initial primary.

# Install the DRBD userland tools with whichever package manager is present.
if command -v apt-get >/dev/null 2>&1; then
  # Debian/Ubuntu: the tools are packaged as "drbd-utils". The older
  # "drbd8-utils" name is not available on current releases, and requesting a
  # missing package makes apt abort the whole install.
  sudo apt-get install drbd-utils
elif command -v yum >/dev/null 2>&1; then
  # RHEL/CentOS: DRBD 8.4 userland + kernel module.
  # NOTE(review): these packages come from ELRepo - confirm the repo is enabled.
  sudo yum install drbd84-utils kmod-drbd84
fi

# Write the resource definition. The names after "on" must match the output
# of `uname -n` on each host; device/disk/meta-disk at resource level are
# inherited by both "on" sections.
sudo tee /etc/drbd.d/nfs.res >/dev/null <<'EOF'
resource nfs {
  protocol C;
  device /dev/drbd0;
  disk /dev/sdb1;
  meta-disk internal;
  on node1 {
    address 192.168.1.101:7789;
  }
  on node2 {
    address 192.168.1.102:7789;
  }
}
EOF

# Initialise the metadata and bring the resource up.
sudo drbdadm create-md nfs
sudo drbdadm up nfs

# Mount point used by the Heartbeat Filesystem resource; needed on both nodes.
sudo mkdir -p /nfs

# Initial primary only (first-time setup): promote the resource and create the
# ext4 filesystem that Heartbeat mounts later.
# WARNING: --force overwrites the peer's data and mkfs destroys anything on
# /dev/drbd0 - never re-run this on an existing cluster.
drbd_primary_node=node1
if [[ "$(uname -n)" == "${drbd_primary_node}" ]]; then
  sudo drbdadm primary --force nfs
  sudo mkfs.ext4 /dev/drbd0
fi
# --- Heartbeat: cluster membership and failover (run on BOTH nodes) ---------

# Install Heartbeat with whichever package manager is present.
# NOTE(review): on RHEL/CentOS the package may require an extra repository.
if command -v apt-get >/dev/null 2>&1; then
  sudo apt-get install heartbeat
elif command -v yum >/dev/null 2>&1; then
  sudo yum install heartbeat
fi

# Main configuration. "bcast eth0" gives Heartbeat a communication path (it
# needs at least one of bcast/ucast/mcast/serial); eth0 is the same interface
# the virtual IP in haresources is bound to.
sudo tee /etc/ha.d/ha.cf >/dev/null <<'EOF'
logfile /var/log/ha-log
keepalive 2
deadtime 10
warntime 5
udpport 694
bcast eth0
auto_failback on
node node1 node2
EOF

# Heartbeat refuses to start without /etc/ha.d/authkeys. The file must be
# mode 600 and carry the SAME secret on both nodes, so the secret is taken
# from the environment instead of being hard-coded here.
if [[ -z "${HA_AUTH_KEY:-}" ]]; then
  printf 'HA_AUTH_KEY is not set; export the same random secret on both nodes\n' >&2
else
  # Create the file with restrictive permissions before the secret is written.
  sudo install -m 600 /dev/null /etc/ha.d/authkeys
  printf 'auth 1\n1 sha1 %s\n' "${HA_AUTH_KEY}" | sudo tee /etc/ha.d/authkeys >/dev/null
fi

# Resources owned by the preferred node (node1), started left to right:
# virtual IP -> DRBD promotion -> filesystem mount.
sudo tee /etc/ha.d/haresources >/dev/null <<'EOF'
node1 IPaddr::192.168.1.100/24/eth0 drbddisk::nfs Filesystem::/dev/drbd0::/nfs::ext4
EOF

# Start Heartbeat.
sudo service heartbeat start
# --- Prometheus server (run on the monitoring host) -------------------------
# Download and unpack a pinned release. The directory name is spelled out
# instead of globbed: "prometheus-*" matches both the tarball and the
# extracted directory, which makes `cd` fail with "too many arguments".
prom_version=2.30.3
prom_dir="prometheus-${prom_version}.linux-amd64"
wget "https://github.com/prometheus/prometheus/releases/download/v${prom_version}/${prom_dir}.tar.gz"
tar xvfz "${prom_dir}.tar.gz"

# Write a complete prometheus.yml. Appending a second "scrape_configs:" key to
# the file shipped in the tarball would produce an invalid config, so the
# bundled file is kept as a backup and replaced.
cp -n "${prom_dir}/prometheus.yml" "${prom_dir}/prometheus.yml.orig"
cat >"${prom_dir}/prometheus.yml" <<'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: ['192.168.1.101:9100', '192.168.1.102:9100']
  - job_name: 'drbd'
    static_configs:
      - targets: ['192.168.1.101:9106', '192.168.1.102:9106']
EOF

# Start Prometheus from inside its directory (so ./data lands there) without
# changing the caller's working directory. nohup keeps it alive after the
# terminal closes; output goes to prometheus.log.
(cd "${prom_dir}" && nohup ./prometheus --config.file=prometheus.yml >prometheus.log 2>&1 &)
# --- Exporters (run on EVERY storage node) ----------------------------------
# Node Exporter: host-level metrics on :9100.
node_exp_version=1.2.2
node_exp_dir="node_exporter-${node_exp_version}.linux-amd64"
wget "https://github.com/prometheus/node_exporter/releases/download/v${node_exp_version}/${node_exp_dir}.tar.gz"
tar xvfz "${node_exp_dir}.tar.gz"
# Run from a subshell so the caller's working directory is left untouched;
# nohup keeps the exporter alive after the terminal closes.
(cd "${node_exp_dir}" && nohup ./node_exporter >node_exporter.log 2>&1 &)

# DRBD Exporter.
# NOTE(review): confirm that this release URL exists and that the exporter
# listens on :9106, the port the Prometheus 'drbd' job scrapes.
drbd_exp_version=0.10.0
wget "https://github.com/metalmatze/drbd-exporter/releases/download/v${drbd_exp_version}/drbd-exporter-${drbd_exp_version}.linux-amd64.tar.gz"
tar xvfz "drbd-exporter-${drbd_exp_version}.linux-amd64.tar.gz"
# The trailing slash restricts the glob to directories, so the tarball sitting
# next to the extracted directory is never passed to `cd`.
(cd "drbd-exporter-${drbd_exp_version}"*/ && nohup ./drbd-exporter >drbd-exporter.log 2>&1 &)
# --- Grafana (Debian/Ubuntu; run on the monitoring host) --------------------
# Prerequisites; gnupg is needed to convert the repository key below.
sudo apt-get install -y apt-transport-https software-properties-common wget gnupg

# Store the repository key in a dedicated keyring and reference it with
# signed-by. `apt-key add` is deprecated and would trust the key for every
# repository on the system.
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg >/dev/null

# Overwrite (not append to) the list file so re-running this section does not
# leave duplicate repository entries.
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt-get update
sudo apt-get install -y grafana

# Start Grafana now and enable it at boot.
sudo systemctl daemon-reload
sudo systemctl start grafana-server
sudo systemctl enable grafana-server

# Open http://<server-ip>:3000
# Default credentials: admin/admin - change the password at first login.
# --- Alertmanager (run on the monitoring host) ------------------------------
# Download and unpack a pinned release. The directory name is spelled out
# instead of globbed: "alertmanager-*" matches both the tarball and the
# extracted directory, which makes `cd` fail with "too many arguments".
am_version=0.23.0
am_dir="alertmanager-${am_version}.linux-amd64"
wget "https://github.com/prometheus/alertmanager/releases/download/v${am_version}/${am_dir}.tar.gz"
tar xvfz "${am_dir}.tar.gz"

# Example configuration: every alert is grouped by name and mailed to a single
# receiver. The addresses, SMTP host and password below are placeholders and
# must be replaced before use.
cat >"${am_dir}/alertmanager.yml" <<'EOF'
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'email'
receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager@example.com'
        auth_password: 'password'
EOF
# The file holds SMTP credentials, so keep it readable by the owner only.
chmod 600 "${am_dir}/alertmanager.yml"

# Start Alertmanager from inside its directory without changing the caller's
# working directory. nohup keeps it alive after the terminal closes.
(cd "${am_dir}" && nohup ./alertmanager --config.file=alertmanager.yml >alertmanager.log 2>&1 &)
# --- Alert rules (run on the monitoring host) -------------------------------
# Directory that holds prometheus.yml; override PROM_DIR if Prometheus was
# unpacked somewhere else.
prom_dir="${PROM_DIR:-prometheus-2.30.3.linux-amd64}"

# Make sure prometheus.yml contains BOTH of the following top-level sections.
# Each key may appear only once, so merge them into existing sections rather
# than pasting duplicates. Without "alerting" Prometheus evaluates the rules
# but never forwards the alerts to Alertmanager.
#
#   rule_files:
#     - 'alert.rules'
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ['localhost:9093']

# Create the rule file next to prometheus.yml: a relative rule_files entry is
# resolved against the directory of the config file, not the current
# directory. The quoted heredoc delimiter keeps the {{ $labels... }} templates
# literal.
# NOTE(review): with one primary and one secondary, "!= 2" looks like it will
# fire permanently for the secondary node - confirm the metric's value
# encoding and consider alerting on the number of primaries instead.
cat >"${prom_dir}/alert.rules" <<'EOF'
groups:
  - name: drbd.rules
    rules:
      - alert: DRBDPrimaryLost
        expr: drbd_role{resource="nfs"} != 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "DRBD primary lost on {{ $labels.instance }}"
          description: "DRBD resource {{ $labels.resource }} is no longer primary on {{ $labels.instance }}"
EOF

# Validate the rules, then make the already-running Prometheus re-read its
# configuration (SIGHUP triggers a reload).
"${prom_dir}/promtool" check rules "${prom_dir}/alert.rules"
pkill -HUP -x prometheus
# --- Optional: Zabbix server (RHEL/CentOS) ----------------------------------
# Register the Zabbix 5.4 repository, then install the server and the agent.
# NOTE(review): confirm the 5.4 el7 repository actually provides
# zabbix-server-mysql before relying on this.
zabbix_release_rpm='https://repo.zabbix.com/zabbix/5.4/rhel/7/x86_64/zabbix-release-5.4-1.el7.noarch.rpm'
sudo rpm -U -v -h "${zabbix_release_rpm}"
sudo yum install zabbix-server-mysql zabbix-agent

# DRBD monitoring in Zabbix is configured by hand:
#   1. Import a DRBD monitoring template in the Zabbix frontend.
#   2. Configure triggers that watch the DRBD state.

# --- Quick diagnostics ------------------------------------------------------
# Current DRBD connection / role / disk state.
cat /proc/drbd
# Follow the Heartbeat unit's journal (blocks until interrupted).
# NOTE(review): ha.cf sets "logfile /var/log/ha-log", so that file may be the
# place where Heartbeat actually logs.
journalctl -u heartbeat -f
此方案提供了端到端的高可用网络存储监控解决方案，可根据实际环境需求进行调整和扩展。