Prometheus + Grafana + Alertmanager组合,因其: - 开源免费 - 强大的时序数据库能力 - 丰富的可视化选项 - 灵活的告警机制
# 下载最新版
wget https://github.com/prometheus/prometheus/releases/download/v2.30.3/prometheus-2.30.3.linux-amd64.tar.gz
tar xvfz prometheus-*.tar.gz
cd prometheus-*
# 创建系统服务
sudo tee /etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/bin/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/var/lib/prometheus/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries
[Install]
WantedBy=multi-user.target
EOF
# 启动服务
sudo systemctl daemon-reload
sudo systemctl enable prometheus
sudo systemctl start prometheus
wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.13.0/mysqld_exporter-0.13.0.linux-amd64.tar.gz
tar xvfz mysqld_exporter-*.tar.gz
cd mysqld_exporter-*
# 创建MySQL监控用户
mysql -u root -p -e "CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'password' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';"
# 配置系统服务
sudo tee /etc/systemd/system/mysqld_exporter.service <<EOF
[Unit]
Description=MySQL Exporter
After=network.target
[Service]
User=mysqld_exporter
Group=mysqld_exporter
Environment="DATA_SOURCE_NAME=exporter:password@(localhost:3306)/"
ExecStart=/usr/local/bin/mysqld_exporter \
--collect.global_status \
--collect.info_schema.innodb_metrics \
--collect.auto_increment.columns \
--collect.info_schema.processlist \
--collect.binlog_size \
--collect.info_schema.tablestats \
--collect.global_variables \
--collect.info_schema.query_response_time \
--collect.info_schema.userstats \
--collect.info_schema.tables \
--collect.perf_schema.tablelocks \
--collect.perf_schema.file_events \
--collect.perf_schema.eventswaits \
--collect.perf_schema.indexiowaits \
--collect.perf_schema.tableiowaits \
--collect.slave_status \
--web.listen-address=0.0.0.0:9104
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable mysqld_exporter
sudo systemctl start mysqld_exporter
wget https://github.com/prometheus-community/postgres_exporter/releases/download/v0.10.1/postgres_exporter-0.10.1.linux-amd64.tar.gz
tar xvfz postgres_exporter-*.tar.gz
cd postgres_exporter-*
# 创建系统服务
sudo tee /etc/systemd/system/postgres_exporter.service <<EOF
[Unit]
Description=PostgreSQL Exporter
After=network.target
[Service]
User=postgres
Environment="DATA_SOURCE_NAME=postgresql://postgres:password@localhost:5432/postgres?sslmode=disable"
ExecStart=/usr/local/bin/postgres_exporter \
--web.listen-address=:9187 \
--extend.query-path=/etc/postgres_exporter/queries.yaml
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable postgres_exporter
sudo systemctl start postgres_exporter
编辑/etc/prometheus/prometheus.yml
:
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'mysql'
static_configs:
- targets: ['db1:9104', 'db2:9104', 'db3:9104']
metrics_path: /metrics
- job_name: 'postgres'
static_configs:
- targets: ['db1:9187', 'db2:9187', 'db3:9187']
metrics_path: /metrics
- job_name: 'node'
static_configs:
- targets: ['db1:9100', 'db2:9100', 'db3:9100']
# Ubuntu/Debian
sudo apt-get install -y apt-transport-https
sudo apt-get install -y software-properties-common wget
wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add -
echo "deb https://packages.grafana.com/oss/deb stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list
sudo apt-get update
sudo apt-get install grafana
# CentOS/RHEL
sudo tee /etc/yum.repos.d/grafana.repo <<EOF
[grafana]
name=grafana
baseurl=https://packages.grafana.com/oss/rpm
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://packages.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
sudo yum install grafana
# 启动服务
sudo systemctl daemon-reload
sudo systemctl enable grafana-server
sudo systemctl start grafana-server
http://<grafana-server>:3000
(默认admin/admin)http://<prometheus-server>:9090
推荐仪表板: - MySQL: ID 7362 - PostgreSQL: ID 9628 - Node Exporter: ID 1860
导入方法: 1. 左侧菜单 → Create → Import 2. 输入仪表板ID 3. 选择Prometheus数据源 4. 点击Import
配置多个Prometheus实例,使用相同的配置抓取相同的目标:
# 在prometheus.yml中添加
global:
external_labels:
replica: 'A' # 在另一个实例上设置为'B'
# 安装Alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar xvfz alertmanager-*.tar.gz
cd alertmanager-*
# 配置集群
sudo tee /etc/alertmanager/alertmanager.yml <<EOF
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 5m
repeat_interval: 3h
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'admin@example.com'
from: 'alertmanager@example.com'
smarthost: 'smtp.example.com:587'
auth_username: 'alertmanager@example.com'
auth_password: 'password'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
EOF
# 启动服务
sudo systemctl enable alertmanager
sudo systemctl start alertmanager
mysql_global_status_uptime
: 数据库运行时间mysql_global_status_threads_connected
: 当前连接数mysql_global_variables_max_connections
: 最大连接数mysql_global_status_questions
: 查询率mysql_global_status_slow_queries
: 慢查询数mysql_global_status_innodb_row_lock_time_avg
: InnoDB平均行锁时间pg_stat_activity_count
: 活动连接数pg_stat_database_xact_commit
: 事务提交率pg_stat_database_xact_rollback
: 事务回滚率pg_stat_user_tables_seq_scan
: 顺序扫描次数pg_stat_user_tables_idx_scan
: 索引扫描次数pg_stat_replication
: 复制状态在Prometheus中配置告警规则(/etc/prometheus/rules.yml
):
groups:
- name: mysql-alerts
rules:
- alert: MySQLDown
expr: up{job="mysql"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "MySQL instance down (instance {{ $labels.instance }})"
description: "MySQL instance is down for more than 1 minute"
- alert: HighThreadsRunning
expr: mysql_global_status_threads_running > 50
for: 5m
labels:
severity: warning
annotations:
summary: "High number of running threads (instance {{ $labels.instance }})"
description: "Number of running threads is {{ $value }}"
- name: postgres-alerts
rules:
- alert: PostgreSQLDown
expr: up{job="postgres"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "PostgreSQL instance down (instance {{ $labels.instance }})"
description: "PostgreSQL instance is down for more than 1 minute"
- alert: HighPostgreSQLConnections
expr: pg_stat_activity_count > (pg_settings_max_connections * 0.8)
for: 5m
labels:
severity: warning
annotations:
summary: "High number of PostgreSQL connections (instance {{ $labels.instance }})"
description: "Number of connections is {{ $value }} (max is {{ pg_settings_max_connections }})"
在prometheus.yml
中引用规则文件:
rule_files:
- '/etc/prometheus/rules.yml'
Prometheus存储优化:
storage.tsdb.retention.time
控制数据保留时间监控数据采样优化:
scrape_interval
告警优化:
group_wait
和group_interval
高可用扩展:
通过以上配置,您可以建立一个功能完善、高可用的数据库集群监控系统,实时掌握数据库集群的健康状态和性能指标。