# Edit /etc/sysctl.conf and add the following parameters
vm.swappiness = 10
vm.overcommit_memory = 1
kernel.shmall = 4294967296
kernel.shmmax = 68719476736
kernel.shmmni = 4096
kernel.sem = 250 32000 100 128
net.ipv4.ip_local_port_range = 1024 65535
net.core.rmem_default = 8388608
net.core.rmem_max = 16777216
net.core.wmem_default = 8388608
net.core.wmem_max = 16777216
# Apply the configuration
sysctl -p
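To confirm the new values are active, they can be read back with sysctl (a quick spot check, not exhaustive):
# Read back a few of the values to confirm they were applied
sysctl vm.swappiness vm.overcommit_memory net.core.rmem_max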
# Recommended mount options for the data disks (add them to the options field of the corresponding /etc/fstab entries)
noatime,nodiratime
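A hypothetical /etc/fstab entry using these options; the device /dev/sdb1, mount point /data, and ext4 filesystem are placeholders for your own layout:
# Example fstab entry for a data disk (device and mount point are placeholders)
/dev/sdb1  /data  ext4  defaults,noatime,nodiratime  0 2
# Apply to an already-mounted filesystem without rebooting
sudo mount -o remount,noatime,nodiratime /data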
# Edit /etc/security/limits.conf and add the following limits
* soft nofile 65536
* hard nofile 65536
* soft nproc 65536
* hard nproc 65536
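These limits only take effect for new login sessions; a quick check after logging back in:
# Show the effective open-file and process limits for the current session
ulimit -n
ulimit -u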
# Install OpenJDK 8
sudo apt-get install openjdk-8-jdk # Ubuntu/Debian
sudo yum install java-1.8.0-openjdk # CentOS/RHEL
# Set JAVA_HOME
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
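The path above is the Ubuntu/Debian default; on CentOS/RHEL the JDK usually lives under /usr/lib/jvm/ with a java-1.8.0-openjdk prefix. A quick sanity check:
# Confirm the JDK is usable and JAVA_HOME points at a real installation
java -version
"$JAVA_HOME/bin/java" -version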
Core configuration file example (hdfs-site.xml):
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/data/hadoop/hdfs/nn</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/data/hadoop/hdfs/dn</value>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>256m</value>
  </property>
</configuration>
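The NameNode must be formatted once before the first start; a minimal sketch assuming the Hadoop bin/ and sbin/ directories are already on the PATH:
# Create the storage directories referenced in hdfs-site.xml
sudo mkdir -p /data/hadoop/hdfs/nn /data/hadoop/hdfs/dn
# Format the NameNode (first start only; this wipes any existing HDFS metadata)
hdfs namenode -format
# Start HDFS and check that the DataNodes have registered
start-dfs.sh
hdfs dfsadmin -report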
# Download and extract Spark
wget https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
tar -xzf spark-3.3.1-bin-hadoop3.tgz
mv spark-3.3.1-bin-hadoop3 /opt/spark
# Configure environment variables
export SPARK_HOME=/opt/spark
export PATH=$PATH:$SPARK_HOME/bin
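A smoke test using the bundled SparkPi example in local mode (no cluster needed; the examples jar name may vary slightly between builds):
# Verify the installation and run SparkPi on the local machine
spark-submit --version
spark-submit --class org.apache.spark.examples.SparkPi \
  --master "local[*]" \
  $SPARK_HOME/examples/jars/spark-examples_*.jar 100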
# Download and install Kafka
wget https://downloads.apache.org/kafka/3.3.1/kafka_2.13-3.3.1.tgz
tar -xzf kafka_2.13-3.3.1.tgz
mv kafka_2.13-3.3.1 /opt/kafka
# Start the ZooKeeper and Kafka services
/opt/kafka/bin/zookeeper-server-start.sh /opt/kafka/config/zookeeper.properties &
/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties &
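A quick end-to-end check against the broker on localhost:9092; the topic name test is arbitrary:
# Create a topic, produce one message, and consume it back
/opt/kafka/bin/kafka-topics.sh --create --topic test --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
echo "hello kafka" | /opt/kafka/bin/kafka-console-producer.sh --topic test --bootstrap-server localhost:9092
/opt/kafka/bin/kafka-console-consumer.sh --topic test --from-beginning --max-messages 1 --bootstrap-server localhost:9092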
# Install Prometheus
wget https://github.com/prometheus/prometheus/releases/download/v2.39.1/prometheus-2.39.1.linux-amd64.tar.gz
tar -xzf prometheus-*.tar.gz
cd prometheus-*
# Configure Prometheus to monitor Hadoop/Spark
# Edit prometheus.yml and add the corresponding scrape jobs
scrape_configs:
  - job_name: 'hadoop'
    static_configs:
      - targets: ['namenode:9100', 'datanode1:9100', 'datanode2:9100']
  - job_name: 'spark'
    static_configs:
      - targets: ['spark-master:7077']
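The targets on port 9100 assume node_exporter is already running on each Hadoop host; note that 7077 is the Spark master's RPC port, so in practice Spark metrics need an HTTP exporter (for example the JMX exporter) rather than this address. Prometheus itself is then started against the edited file:
# Start Prometheus (web UI and API on :9090 by default)
./prometheus --config.file=prometheus.yml &
# Check which targets are being scraped
curl -s http://localhost:9090/api/v1/targets | head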
# Ubuntu/Debian
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.2.3_amd64.deb
sudo dpkg -i grafana-enterprise_9.2.3_amd64.deb
# CentOS/RHEL
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.2.3-1.x86_64.rpm
sudo yum install grafana-enterprise-9.2.3-1.x86_64.rpm
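After installation, Grafana runs as a systemd service and serves its web UI on port 3000 (default credentials admin/admin, changed at first login):
# Start Grafana and enable it at boot
sudo systemctl daemon-reload
sudo systemctl enable --now grafana-server
# Web UI: http://<host>:3000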
# Install the Kerberos KDC and admin server
sudo apt-get install krb5-kdc krb5-admin-server # Ubuntu/Debian
sudo yum install krb5-server krb5-workstation # CentOS/RHEL
# Configure Hadoop to use Kerberos
# Add the following to core-site.xml
<property>
  <name>hadoop.security.authentication</name>
  <value>kerberos</value>
</property>
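Full Kerberos integration also requires per-service principals and keytabs; a minimal sketch on the KDC host, where the realm EXAMPLE.COM, hostname nn1.example.com, and keytab path are placeholders for your own:
# Create an HDFS NameNode service principal and export its keytab (all names are placeholders)
sudo kadmin.local -q "addprinc -randkey nn/nn1.example.com@EXAMPLE.COM"
sudo kadmin.local -q "ktadd -k /etc/security/keytabs/nn.service.keytab nn/nn1.example.com@EXAMPLE.COM"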
# Open the ports commonly used by the big data components
sudo firewall-cmd --permanent --add-port=8020/tcp # HDFS NameNode
sudo firewall-cmd --permanent --add-port=9870/tcp # HDFS NameNode Web UI (Hadoop 3.x; 50070 on Hadoop 2.x)
sudo firewall-cmd --permanent --add-port=8088/tcp # YARN ResourceManager
sudo firewall-cmd --permanent --add-port=9092/tcp # Kafka
sudo firewall-cmd --reload
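firewall-cmd applies to firewalld on CentOS/RHEL; on Ubuntu/Debian hosts using ufw the equivalent would be:
# Ubuntu/Debian equivalent using ufw
sudo ufw allow 8020/tcp   # HDFS NameNode RPC
sudo ufw allow 9870/tcp   # HDFS NameNode Web UI
sudo ufw allow 8088/tcp   # YARN ResourceManager
sudo ufw allow 9092/tcp   # Kafka
sudo ufw reload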
version: '3'
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    ports:
      - "9870:9870"
      - "9000:9000"
  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    depends_on:
      - namenode
  spark-master:
    image: bde2020/spark-master:3.1.1-hadoop3.2
    ports:
      - "8080:8080"
      - "7077:7077"
  spark-worker:
    image: bde2020/spark-worker:3.1.1-hadoop3.2
    depends_on:
      - spark-master
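Note that the bde2020 Hadoop images typically expect additional environment settings (for example a CLUSTER_NAME variable and a shared hadoop.env env_file); check the image documentation before relying on this minimal compose file. Assuming it is saved as docker-compose.yml, the stack is brought up with:
# Start the containers in the background and check their status
docker compose up -d
docker compose ps
# Tear everything down when finished
docker compose down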
# Generate test data
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar teragen 10000000 /tera/input
# Run the sort benchmark
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar terasort /tera/input /tera/output
# Validate the results
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar teravalidate /tera/output /tera/validate
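TeraGen writes 100-byte rows, so 10,000,000 rows is roughly 1 GB of input; the validation report (standard MapReduce output file name) can be inspected and the benchmark data removed afterwards:
# Inspect the TeraValidate report and clean up the benchmark data
hdfs dfs -cat /tera/validate/part-r-00000
hdfs dfs -rm -r /tera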
The settings above can be adjusted to match the actual hardware resources and workload; thorough testing and performance tuning are recommended before deploying to production.