Setting up highly available monitoring for a container orchestration platform means making the monitoring stack itself reliable, keeping the data durable, and wiring up dependable alerting. Below is a complete solution based on Prometheus, Grafana, and Alertmanager, with Thanos providing global querying and long-term storage.
# prometheus-ha.yaml (Kubernetes example)
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
spec:
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - name: prometheus
        image: prom/prometheus   # pin an explicit version tag in production
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--web.console.templates=/etc/prometheus/consoles"
        - "--web.console.libraries=/etc/prometheus/console_libraries"
        - "--storage.tsdb.retention.time=30d"
        ports:
        - containerPort: 9090
        volumeMounts:
        - name: prometheus-config
          mountPath: /etc/prometheus
        - name: prometheus-storage
          mountPath: /prometheus
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
      - name: prometheus-storage
        emptyDir: {}   # demo only; see the persistent storage section below for durable data
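The `serviceName: prometheus` field above refers to a governing headless Service that gives each replica a stable DNS identity (prometheus-0.prometheus, prometheus-1.prometheus), but that Service is not shown. A minimal sketch:

# prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
spec:
  clusterIP: None   # headless: each pod gets a stable per-pod DNS record
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
      targetPort: 9090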
# prometheus.yml (excerpt)
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: "production"
    # The replica label must differ per instance so Thanos Query can deduplicate.
    # Note: Prometheus does not expand $(HOSTNAME) by itself; inject the pod name
    # via an init container or the --enable-feature=expand-external-labels flag.
    replica: $(HOSTNAME)
rule_files:
  - /etc/prometheus/rules/*.rules
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
scrape_configs:
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
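Service discovery against the Kubernetes API needs RBAC permissions that the manifests above do not grant. A minimal sketch, assuming the Prometheus pods run under a `prometheus` ServiceAccount in a `monitoring` namespace (both names are illustrative; set `serviceAccountName: prometheus` in the StatefulSet pod spec):

# prometheus-rbac.yaml (illustrative names)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # read-only access to the objects the kubernetes_sd_configs roles enumerate
  - apiGroups: [""]
    resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring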
# Run a Thanos sidecar alongside each Prometheus instance
# thanos-sidecar.yaml (add to the containers list of the Prometheus StatefulSet above)
      - name: thanos-sidecar
        image: thanosio/thanos:v0.28.0
        args:
        - sidecar
        - --prometheus.url=http://localhost:9090
        - --tsdb.path=/prometheus
        - --objstore.config-file=/etc/thanos/minio-bucket.yaml
        ports:
        - name: http-sidecar
          containerPort: 10902
        - name: grpc
          containerPort: 10901
        volumeMounts:
        - name: prometheus-storage
          mountPath: /prometheus
          # not readOnly: the sidecar records shipped blocks (thanos.shipper.json) in the TSDB dir
        - name: thanos-config
          mountPath: /etc/thanos
# thanos-query.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-query
spec:
  replicas: 2
  selector:              # required in apps/v1 and missing from many copy-paste examples
    matchLabels:
      app: thanos-query
  template:
    metadata:
      labels:
        app: thanos-query
    spec:
      containers:
      - name: thanos-query
        image: thanosio/thanos:v0.28.0
        args:
        - query
        - --http-address=0.0.0.0:10902
        - --grpc-address=0.0.0.0:10901
        - --query.replica-label=replica   # deduplicate series across the two Prometheus replicas
        - --store=thanos-storegateway:10901
        - --store=prometheus-0.thanos-sidecar:10901
        - --store=prometheus-1.thanos-sidecar:10901
        ports:
        - containerPort: 10902
        - containerPort: 10901
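The Grafana datasource below points at http://thanos-query:10902, so Query also needs a ClusterIP Service on its HTTP port. A minimal sketch:

# thanos-query-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: thanos-query
spec:
  selector:
    app: thanos-query
  ports:
    - name: http
      port: 10902
      targetPort: 10902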
# grafana.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
spec:
  replicas: 2
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:8.3.4
        ports:
        - containerPort: 3000
        env:
        - name: GF_SECURITY_ADMIN_PASSWORD
          valueFrom:
            secretKeyRef:          # avoid hardcoding credentials; create this Secret separately
              name: grafana-admin
              key: password
        volumeMounts:
        - name: grafana-storage
          mountPath: /var/lib/grafana
        - name: grafana-datasources
          mountPath: /etc/grafana/provisioning/datasources
      volumes:
      - name: grafana-storage
        persistentVolumeClaim:
          # caveat: two replicas sharing one ReadWriteOnce PVC cannot land on different
          # nodes; use ReadWriteMany storage or an external database for true Grafana HA
          claimName: grafana-pvc
      - name: grafana-datasources
        configMap:
          name: grafana-datasources
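The deployment references a grafana-pvc claim that is not defined anywhere in this section. A minimal sketch (size and defaults are illustrative):

# grafana-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi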
# grafana-datasources.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
data:
  prometheus.yaml: |-
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        url: http://thanos-query:10902
        access: proxy
        isDefault: true
# alertmanager.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: prom/alertmanager:v0.24.0
        args:
        - --config.file=/etc/alertmanager/config.yml
        - --storage.path=/alertmanager
        - --cluster.listen-address=0.0.0.0:9094
        - --cluster.advertise-address=$(POD_IP):9094
        # the replicas must be told about each other to form the gossip cluster;
        # peers resolve through the headless "alertmanager" Service (see below)
        - --cluster.peer=alertmanager-0.alertmanager:9094
        - --cluster.peer=alertmanager-1.alertmanager:9094
        - --cluster.peer=alertmanager-2.alertmanager:9094
        env:
        - name: POD_IP             # required for the $(POD_IP) expansion above
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        ports:
        - name: http
          containerPort: 9093
        - name: cluster
          containerPort: 9094
        volumeMounts:
        - name: config
          mountPath: /etc/alertmanager
        - name: storage
          mountPath: /alertmanager
      volumes:
      - name: config
        configMap:
          name: alertmanager-config
      - name: storage
        emptyDir: {}
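The gossip peers above resolve through the StatefulSet's governing headless Service, and Prometheus reaches the cluster at alertmanager:9093 through the same name. One Service covers both; a minimal sketch:

# alertmanager-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
spec:
  clusterIP: None   # headless: enables alertmanager-<n>.alertmanager peer DNS names
  selector:
    app: alertmanager
  ports:
    - name: http
      port: 9093
      targetPort: 9093
    - name: cluster
      port: 9094
      targetPort: 9094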
# alertmanager-config.yaml (mounted as /etc/alertmanager/config.yml via the alertmanager-config ConfigMap)
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'user'
  smtp_auth_password: 'password'
route:
  group_by: ['alertname', 'cluster']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: 'team-email'
receivers:
  - name: 'team-email'
    email_configs:
      - to: 'team@example.com'
        send_resolved: true
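Routing can be refined per severity so critical alerts escalate faster than warnings. A hedged sketch (the oncall-email receiver and its interval are illustrative, not part of the config above):

route:
  group_by: ['alertname', 'cluster']
  receiver: 'team-email'
  routes:
    - match:
        severity: critical    # matches the labels set in the alert rules below
      receiver: 'oncall-email'
      repeat_interval: 1h     # re-notify more aggressively for critical alerts
receivers:
  - name: 'oncall-email'
    email_configs:
      - to: 'oncall@example.com'
        send_resolved: true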
# prometheus.yml (excerpt, continued) -- additional scrape jobs
  - job_name: 'kubernetes-nodes'
    # node metrics are scraped through the API server proxy, so the same
    # in-cluster credentials are needed as for the apiservers job
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
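For a pod to be picked up by this job it only needs the matching annotations on its pod template. A sketch (the port and path values are illustrative):

# excerpt from an application Deployment's pod template
metadata:
  annotations:
    prometheus.io/scrape: "true"    # matched by the keep rule above
    prometheus.io/port: "8080"      # rewrites __address__ to <pod-ip>:8080
    prometheus.io/path: "/metrics"  # overrides the default metrics path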
# alert-rules.yaml
groups:
  - name: kubernetes-resources
    rules:
      - alert: KubeCPUOvercommit
        expr: sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable_cpu_cores) > 1.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Cluster has overcommitted CPU resources
          description: "CPU requests are {{ $value }} times the allocatable capacity"
      - alert: KubeMemoryOvercommit
        expr: sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable_memory_bytes) > 1.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Cluster has overcommitted memory resources
          description: "Memory requests are {{ $value }} times the allocatable capacity"
      - alert: KubePodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[5m]) > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Pod {{ $labels.pod }} in {{ $labels.namespace }} is crash looping
          description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is restarting frequently ({{ $value }} restarts per second over the last 5 minutes)"
# prometheus.yml (excerpt) -- self-monitoring of the stack itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'thanos'
    static_configs:
      - targets: ['thanos-query:10902']
  - job_name: 'grafana'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['grafana:3000']
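Both the grafana:3000 scrape target and the Ingress below assume a Service named grafana on port 3000, which is not defined elsewhere in this section. A minimal sketch:

# grafana-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: grafana
spec:
  selector:
    app: grafana
  ports:
    - name: http
      port: 3000
      targetPort: 3000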
# Use a PersistentVolumeClaim instead of emptyDir for Prometheus data
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
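A single named PVC cannot serve both replicas of the StatefulSet, since ReadWriteOnce binds the volume to one node. The idiomatic fix is a volumeClaimTemplates entry, which creates one claim per replica. A sketch of what replaces the emptyDir volume in the StatefulSet above:

# added under spec: of the Prometheus StatefulSet (and remove the emptyDir volume)
  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage   # matches the existing volumeMounts name
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi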
# minio-bucket.yaml -- the object storage config the sidecar references via --objstore.config-file
type: S3
config:
  bucket: thanos
  endpoint: minio.example.com
  access_key: ACCESS_KEY
  secret_key: SECRET_KEY
  insecure: false
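The sidecar mounts this file from a thanos-config volume that the StatefulSet never defines; since the file holds credentials, a Secret is the natural vehicle. A sketch, reusing the names from the volumeMounts above:

# thanos-config Secret
apiVersion: v1
kind: Secret
metadata:
  name: thanos-config
stringData:
  minio-bucket.yaml: |
    type: S3
    config:
      bucket: thanos
      endpoint: minio.example.com
      access_key: ACCESS_KEY
      secret_key: SECRET_KEY
      insecure: false

# and in the Prometheus StatefulSet's volumes list:
      - name: thanos-config
        secret:
          secretName: thanos-config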
# grafana-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  annotations:
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: grafana-auth
    nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  rules:
    - host: grafana.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
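The auth-secret annotation points at a grafana-auth Secret that nginx-ingress expects to hold htpasswd content under the key auth. A sketch with a placeholder hash (generate the real one with htpasswd):

# grafana-auth secret for the basic-auth annotation
apiVersion: v1
kind: Secret
metadata:
  name: grafana-auth
type: Opaque
stringData:
  auth: |
    admin:$apr1$REPLACE_WITH_HTPASSWD_HASH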
# network-policy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-monitoring
spec:
  podSelector:
    matchLabels:
      app: prometheus
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              monitoring: enabled
      ports:
        - protocol: TCP
          port: 9090
With the configuration above you get a highly available monitoring system for your container orchestration platform: replicated Prometheus with Thanos for global querying and long-term storage, a clustered Alertmanager, and redundant Grafana. The stack can ride out node failures, network partitions, and similar faults while continuing to serve monitoring data and alerts.