A containerized big data analytics platform typically consists of the following core components:
- A container orchestration platform (Kubernetes or Docker Swarm)
- A distributed storage system
- A big data processing framework
- A stream processing tool
- Monitoring and logging systems
# Ubuntu
sudo apt-get update
sudo apt-get install docker.io
sudo systemctl enable --now docker
# CentOS
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
sudo yum install docker-ce docker-ce-cli containerd.io
sudo systemctl enable --now docker
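After installation, it is worth a quick smoke test to confirm the daemon is running and can start containers:

# Confirm the daemon is active and check the version
sudo systemctl status docker --no-pager
docker --version
# Run a throwaway container end to end
sudo docker run --rm hello-world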
# Install kubeadm, kubelet, and kubectl
sudo apt-get update && sudo apt-get install -y apt-transport-https curl
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
# Note: this legacy Google-hosted repository has been deprecated and frozen;
# on new clusters use the community-owned repositories at pkgs.k8s.io instead.
echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-get install -y kubelet kubeadm kubectl
sudo apt-mark hold kubelet kubeadm kubectl
# Initialize the cluster
sudo kubeadm init --pod-network-cidr=10.244.0.0/16
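kubeadm prints its own follow-up instructions when it finishes; the essential steps are copying the admin kubeconfig and installing a pod network add-on. A minimal sketch using Flannel, whose default network matches the 10.244.0.0/16 CIDR chosen above:

# Make kubectl usable for the current user
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
# Install the Flannel CNI plugin
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml
# The node should report Ready once the network is up
kubectl get nodes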
# Pull a Hadoop image (availability varies; apache/hadoop is an alternative if bitnami/hadoop is unavailable)
docker pull bitnami/hadoop:latest
# Example docker-compose.yml
version: '2'
services:
  namenode:
    image: bitnami/hadoop:latest
    environment:
      # The role variable below is illustrative; environment variable
      # names differ between Hadoop images, so check the image docs.
      - HADOOP_ROLE=NAMENODE
    ports:
      - "9870:9870"
      - "9000:9000"
  datanode:
    image: bitnami/hadoop:latest
    environment:
      - HADOOP_ROLE=DATANODE
    depends_on:
      - namenode
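To bring HDFS up and confirm the NameNode answers, something like the following should work from the directory containing the compose file (use docker-compose if only the standalone binary is installed; port 9870 matches the mapping above):

# Start the HDFS containers in the background
docker compose up -d
# Follow the NameNode logs until startup completes
docker compose logs -f namenode
# The NameNode web UI should answer on the mapped port
curl -s http://localhost:9870 | head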
# Official Spark image
docker pull apache/spark:latest
# Start the Spark master
docker run -d -p 8080:8080 -p 7077:7077 --name spark-master apache/spark:latest /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master
# Start a Spark worker
docker run -d --name spark-worker -e SPARK_MASTER=spark://<master-ip>:7077 apache/spark:latest /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://<master-ip>:7077
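To verify the standalone cluster end to end, you can submit the SparkPi example bundled with the image; a sketch, assuming the examples jar sits in its usual place under /opt/spark (the exact jar name varies by Spark version):

# Run the bundled SparkPi example against the standalone master
docker exec spark-master sh -c \
  '/opt/spark/bin/spark-submit \
    --master spark://<master-ip>:7077 \
    --class org.apache.spark.examples.SparkPi \
    /opt/spark/examples/jars/spark-examples_*.jar 100'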
# Pull the wurstmeister Kafka image (a community image, not an official one)
docker pull wurstmeister/kafka
# Example docker-compose.yml
version: '3'
services:
  zookeeper:
    image: wurstmeister/zookeeper
    ports:
      - "2181:2181"
  kafka:
    image: wurstmeister/kafka
    ports:
      - "9092:9092"
    environment:
      KAFKA_ADVERTISED_HOST_NAME: kafka
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
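A quick way to verify the broker is to push a message through a test topic; a sketch, assuming the Kafka CLI scripts live under /opt/kafka/bin inside the wurstmeister image:

# Create a single-partition test topic
docker compose exec kafka /opt/kafka/bin/kafka-topics.sh \
  --create --topic test --partitions 1 --replication-factor 1 \
  --bootstrap-server localhost:9092
# Produce a message (type a line, then Ctrl-D)
docker compose exec kafka /opt/kafka/bin/kafka-console-producer.sh \
  --topic test --bootstrap-server localhost:9092
# Consume it back
docker compose exec kafka /opt/kafka/bin/kafka-console-consumer.sh \
  --topic test --from-beginning --max-messages 1 \
  --bootstrap-server localhost:9092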
# Install Helm
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# Add the Bitnami repository
helm repo add bitnami https://charts.bitnami.com/bitnami
# Deploy Hadoop (verify chart availability first with: helm search repo hadoop)
helm install my-hadoop bitnami/hadoop
# Deploy Spark
helm install my-spark bitnami/spark
# Deploy Kafka
helm install my-kafka bitnami/kafka
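After the installs finish, confirm the releases and wait for their pods to become ready:

# List installed releases and watch the chart pods come up
helm list
kubectl get pods -w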
# Example hadoop-values.yaml
# Key names vary between chart versions; compare with `helm show values`.
nameNode:
  resources:
    limits:
      cpu: 2
      memory: 4Gi
    requests:
      cpu: 1
      memory: 2Gi
dataNode:
  replicas: 3
  resources:
    limits:
      cpu: 1
      memory: 2Gi
    requests:
      cpu: 500m
      memory: 1Gi
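The values file is applied with -f at install time; since key names differ between chart versions, it helps to diff against the chart defaults first:

# Inspect the chart's default values before overriding them
helm show values bitnami/hadoop > defaults.yaml
# Install (or upgrade) with the custom resource settings
helm upgrade --install my-hadoop bitnami/hadoop -f hadoop-values.yaml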
# pv.yaml
# Note: hostPath volumes are node-local, so ReadWriteMany is only
# meaningful here on a single-node or test cluster.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: hadoop-data
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteMany
  hostPath:
    path: "/mnt/data/hadoop"
# aws-ebs-pv.yaml
# Note: the in-tree awsElasticBlockStore plugin is deprecated; on
# current clusters provision EBS volumes through the EBS CSI driver
# (see the StorageClass sketch below).
apiVersion: v1
kind: PersistentVolume
metadata:
  name: aws-ebs-pv
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  awsElasticBlockStore:
    volumeID: <volume-id>
    fsType: ext4
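On clusters with the AWS EBS CSI driver installed, volumes are usually provisioned dynamically through a StorageClass instead of being created by hand; a sketch under that assumption:

# ebs-sc.yaml (assumes the EBS CSI driver is installed)
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ebs-gp3
provisioner: ebs.csi.aws.com
parameters:
  type: gp3
volumeBindingMode: WaitForFirstConsumer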
# Deploy Prometheus (add the chart repository first)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install prometheus prometheus-community/prometheus
# Deploy Grafana
helm repo add grafana https://grafana.github.io/helm-charts
helm install grafana grafana/grafana
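Once the Grafana pod is running, the chart stores a generated admin password in a Secret; a sketch for retrieving it and reaching the UI locally (secret and service names follow the grafana/grafana chart defaults):

# Read the generated admin password
kubectl get secret grafana -o jsonpath="{.data.admin-password}" | base64 --decode; echo
# Forward the Grafana service to localhost:3000
kubectl port-forward svc/grafana 3000:80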
# Deploy the ELK stack (add the Elastic chart repository first)
helm repo add elastic https://helm.elastic.co
# Deploy Elasticsearch
helm install elasticsearch elastic/elasticsearch
# Deploy Kibana
helm install kibana elastic/kibana
# Deploy Filebeat
helm install filebeat elastic/filebeat
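Kibana can likewise be reached with a port-forward; the service name below assumes the elastic/kibana chart's default naming for a release called kibana:

# Forward Kibana to localhost:5601 (service name is an assumption)
kubectl port-forward svc/kibana-kibana 5601:5601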
# network-policy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: hadoop-network-policy
spec:
  podSelector:
    matchLabels:
      app: hadoop
  # Listing Egress without any egress rules denies all outbound
  # traffic from the selected pods; add egress rules as needed.
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: spark
      ports:
        - protocol: TCP
          port: 9000
# spark-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: spark-role
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "watch", "list", "create", "delete"]
Problem 1: Containers run out of memory - Solution: raise the container memory limit and adjust the JVM heap to match (see the sketch below)
Problem 2: Data nodes cannot connect - Solution: check network policies and firewall settings
Problem 3: Storage performance bottleneck - Solution: use local SSDs or high-performance cloud storage
Problem 4: Scheduling latency - Solution: tune the Kubernetes scheduler configuration and add node resources
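For problem 1 the fix has two halves: raise the container limit and keep the JVM heap below it, since a heap larger than the cgroup limit gets the process OOM-killed. A sketch of the pairing in a Kubernetes container spec (values are illustrative):

# Set the container limit and the JVM heap together, leaving headroom
# for off-heap memory (metaspace, threads, direct buffers).
resources:
  limits:
    memory: 4Gi
env:
  - name: JAVA_TOOL_OPTIONS   # read by HotSpot JVMs at startup
    value: "-Xmx3g -XX:+ExitOnOutOfMemoryError"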
With the steps above, you can build a containerized big data analytics platform on Linux. Adjust the component configuration and scale as your workload requires.