集群部署与运维
部署架构规划
典型生产集群规划
┌─────────────────────────────────────────────────────────────┐
│ 生产集群规划(示例) │
│ │
│ 管理节点(3台,高可用) │
│ ├── NameNode(Active + Standby) │
│ ├── ResourceManager(Active + Standby) │
│ ├── HMaster │
│ ├── Zookeeper(3节点) │
│ └── JournalNode(3节点) │
│ │
│ 数据节点(N台,按需扩展) │
│ ├── DataNode │
│ ├── NodeManager │
│ └── RegionServer(HBase) │
│ │
│ 计算节点(可与数据节点合并) │
│ ├── Spark Executor │
│ └── Flink TaskManager │
│ │
│ 服务节点(独立部署) │
│ ├── HiveServer2 │
│ ├── Kafka Broker(3台) │
│ ├── Airflow(Scheduler + WebServer) │
│ └── 监控(Prometheus + Grafana) │
└─────────────────────────────────────────────────────────────┘
硬件配置建议
| 节点类型 | CPU | 内存 | 磁盘 | 网络 |
|---|---|---|---|---|
| 管理节点 | 16核 | 64GB | SSD 500GB | 万兆 |
| 数据节点 | 32核 | 128GB | HDD 12x4TB | 万兆 |
| Kafka 节点 | 16核 | 64GB | SSD 4x2TB | 万兆 |
| OLAP 节点 | 32核 | 256GB | SSD 8x2TB | 万兆 |
Hadoop 集群部署
环境准备
bash
# Environment preparation: run on every node as root.
# Each step is guarded so that re-running the script does not duplicate entries.

# 1. Stop the firewall now and keep it off after reboot
systemctl stop firewalld
systemctl disable firewalld

# 2. Disable SELinux: setenforce affects the running system, the sed makes it
#    persistent. setenforce returns non-zero when SELinux is already disabled.
setenforce 0 || true
sed -i 's/SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config

# 3. Register the cluster host names (skipped when nn1 is already present)
grep -qw 'nn1' /etc/hosts || cat >> /etc/hosts << EOF
192.168.1.10 nn1
192.168.1.11 nn2
192.168.1.12 zk1
192.168.1.13 zk2
192.168.1.14 zk3
192.168.1.20 dn1
192.168.1.21 dn2
192.168.1.22 dn3
EOF

# 4. Password-less SSH: generate a key only if none exists, then push it
[ -f ~/.ssh/id_rsa ] || ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
ssh-copy-id hadoop@nn1
ssh-copy-id hadoop@nn2
# ... repeat for every remaining node

# 5. Install the JDK and export JAVA_HOME once
yum install -y java-1.8.0-openjdk-devel
grep -q 'JAVA_HOME=' /etc/profile || \
  echo 'export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk' >> /etc/profile
source /etc/profile

# 6. Clock synchronisation: sync once now, then every 5 minutes from cron.
#    cron normally runs jobs with a minimal PATH that omits /usr/sbin,
#    so ntpdate is called by its absolute path.
yum install -y ntpdate
ntpdate ntp.aliyun.com
grep -q 'ntpdate ntp.aliyun.com' /var/spool/cron/root 2>/dev/null || \
  echo "*/5 * * * * /usr/sbin/ntpdate ntp.aliyun.com" >> /var/spool/cron/root
Hadoop 配置
xml
<!-- core-site.xml -->
<configuration>
  <!-- Default file system: the logical HA nameservice, not a single NameNode host -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://mycluster</value>
  </property>
  <!-- Base directory for Hadoop's temporary/working files -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop/tmp</value>
  </property>
  <!-- ZooKeeper ensemble used for automatic NameNode failover -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>zk1:2181,zk2:2181,zk3:2181</value>
  </property>
</configuration>
xml
<!-- hdfs-site.xml (HA configuration) -->
<configuration>
  <!-- Logical nameservice; must match the authority of fs.defaultFS in core-site.xml -->
  <property>
    <name>dfs.nameservices</name>
    <value>mycluster</value>
  </property>
  <!-- IDs of the two NameNodes that form the nameservice -->
  <property>
    <name>dfs.ha.namenodes.mycluster</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC endpoint of each NameNode -->
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn1</name>
    <value>nn1:9000</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn2</name>
    <value>nn2:9000</value>
  </property>
  <!-- JournalNode quorum that stores the shared edit log -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://zk1:8485;zk2:8485;zk3:8485/mycluster</value>
  </property>
  <!-- Local directory where each JournalNode keeps its edits; set explicitly
       so the data does not live under the /tmp-based default -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/data/hadoop/journalnode</value>
  </property>
  <!-- Lets HDFS clients find the active NameNode behind hdfs://mycluster -->
  <property>
    <name>dfs.client.failover.proxy.provider.mycluster</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <!-- ZKFC-driven automatic failover -->
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <!-- A fencing method is mandatory when automatic failover is enabled.
       Methods are tried in order, one per line: SSH to the old active and kill
       it first, then fall back to a no-op so failover still proceeds when the
       old host is unreachable. -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence
shell(/bin/true)</value>
  </property>
  <!-- Private key used by sshfence.
       NOTE(review): assumes the daemons run as user "hadoop" with the key
       generated during environment preparation; adjust the path otherwise. -->
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/home/hadoop/.ssh/id_rsa</value>
  </property>
  <!-- Number of replicas per block -->
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <!-- Block size in bytes: 134217728 = 128 MiB -->
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>
</configuration>
初始化与启动
bash
# First-time initialisation of the HA cluster.
# The ZooKeeper ensemble listed in ha.zookeeper.quorum must already be running.

# 1. Create the failover znode in ZooKeeper (run once, on nn1 only)
hdfs zkfc -formatZK

# 2. Start a JournalNode (run on every JournalNode host)
hadoop-daemon.sh start journalnode

# 3. Format the NameNode (run once, on nn1 only)
hdfs namenode -format

# 4. Start the first NameNode (on nn1) so the standby can copy from it
hadoop-daemon.sh start namenode

# 5. Copy the formatted metadata to the standby NameNode (run on nn2)
hdfs namenode -bootstrapStandby

# 6. Start all HDFS and YARN services
start-dfs.sh
start-yarn.sh

# 7. Verify that DataNodes and NodeManagers have registered
hdfs dfsadmin -report
yarn node -list
Kafka 集群部署
bash
# Download and unpack Kafka 3.6.0.
# downloads.apache.org only carries current releases; older versions such as
# 3.6.0 are served from archive.apache.org.
wget https://archive.apache.org/dist/kafka/3.6.0/kafka_2.13-3.6.0.tgz
tar -xzf kafka_2.13-3.6.0.tgz -C /opt/
# Version-independent path; -sfn replaces an existing link so the step can be re-run
ln -sfn /opt/kafka_2.13-3.6.0 /opt/kafka
properties
# config/server.properties (values marked "per broker" differ on every node)

# Broker ID (per broker, must be unique in the cluster)
broker.id=1

# Listener addresses (per broker: use this node's own host name)
listeners=PLAINTEXT://kafka1:9092
advertised.listeners=PLAINTEXT://kafka1:9092

# Log directories, one per physical disk
log.dirs=/data1/kafka,/data2/kafka,/data3/kafka

# ZooKeeper connection string; the /kafka suffix is a chroot path
zookeeper.connect=zk1:2181,zk2:2181,zk3:2181/kafka

# Retention: 168 hours = 7 days; segment size 1073741824 bytes = 1 GiB
log.retention.hours=168
log.segment.bytes=1073741824

# Replication defaults for newly created topics
default.replication.factor=3
min.insync.replicas=2
num.partitions=6

# Thread pools and socket buffers (102400 bytes = 100 KiB)
num.network.threads=8
num.io.threads=16
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
bash
# Start the broker in the background with the configuration above
/opt/kafka/bin/kafka-server-start.sh -daemon /opt/kafka/config/server.properties

# Verify: listing topics succeeds once the broker accepts connections
/opt/kafka/bin/kafka-topics.sh --list --bootstrap-server kafka1:9092
Docker Compose 快速部署(开发环境)
yaml
# docker-compose.yml
# Single-host development stack: ZooKeeper, Kafka, HDFS, Hive metastore, Spark.
version: '3.8'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    environment:
      ZOOKEEPER_CLIENT_PORT: "2181"
    ports:
      - "2181:2181"

  kafka:
    image: confluentinc/cp-kafka:7.5.0
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: "1"
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # Two listeners: containers on the compose network connect to
      # kafka:29092, clients on the host keep using localhost:9092.
      # Advertising only localhost makes the broker unreachable from
      # other containers.
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 'PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT'
      KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092'
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      # Single broker, so the offsets topic cannot have more than one replica
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: "1"
    ports:
      - "9092:9092"

  # NOTE(review): hadoop.env is not shown here; its fs.defaultFS setting must
  # point at this service name (hadoop-namenode) for the other services to
  # find it.
  hadoop-namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      - "9870:9870"
      - "9000:9000"
    volumes:
      - hadoop_namenode:/hadoop/dfs/name

  hadoop-datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    depends_on:
      - hadoop-namenode
    env_file:
      - ./hadoop.env
    volumes:
      - hadoop_datanode:/hadoop/dfs/data

  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    env_file:
      - ./hadoop.env
    # Start the metastore service explicitly instead of relying on the
    # image's default start-up command.
    # NOTE(review): the metastore also needs the backing database referenced
    # by hadoop.env to be reachable; no database service is defined here.
    command: /opt/hive/bin/hive --service metastore
    depends_on:
      - hadoop-namenode
    ports:
      - "9083:9083"

  spark-master:
    image: bde2020/spark-master:3.3.0-hadoop3.3
    ports:
      - "8080:8080"
      - "7077:7077"

  spark-worker:
    image: bde2020/spark-worker:3.3.0-hadoop3.3
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077

volumes:
  hadoop_namenode:
  hadoop_datanode:
日常运维
扩容 DataNode
bash
# 1. On the new node: install Hadoop and copy the same hdfs-site.xml as the
#    rest of the cluster

# 2. Start the DataNode (run on the new node)
hadoop-daemon.sh start datanode

# 3. Rebalance existing blocks onto the new node; -threshold 10 is the
#    allowed deviation, in percent, of each node's usage from the cluster average
hdfs balancer -threshold 10

# 4. Verify that the new node appears in the report
hdfs dfsadmin -report
磁盘故障处理
bash
# Look for failed volumes
# NOTE(review): confirm that your Hadoop version prints a "Bad volumes" field
# in this report; failed volumes are also shown in the NameNode web UI.
hdfs dfsadmin -report | grep "Bad volumes"

# Decommission the faulty DataNode. dfsadmin has no -decommission option:
# add the host to the exclude file and ask the NameNode to re-read it.
# NOTE(review): the file must be the one configured as dfs.hosts.exclude in
# hdfs-site.xml; the path below is an assumption.
echo "dn3" >> "$HADOOP_CONF_DIR/dfs.exclude"
hdfs dfsadmin -refreshNodes

# Wait for the blocks to be re-replicated elsewhere: the under-replicated
# count should drop to 0 and dn3 should report "Decommissioned"
hdfs dfsadmin -report | grep "Under replicated"
hdfs dfsadmin -report | grep -A 1 "Hostname: dn3"

# Only then stop the DataNode (run on dn3)
hadoop-daemon.sh stop datanode
定期维护任务
bash
# Weekly: rebalance HDFS
hdfs balancer -threshold 10

# Monthly: clean up Hive scratch directories.
# Use Hive's own tool, which removes only directories whose session is gone;
# a blanket "hdfs dfs -rm -r /tmp/hive/*" also deletes the scratch data of
# queries that are still running.
hive --service cleardanglingscratchdir

# Daily: check HDFS for corrupt blocks
hdfs fsck / -list-corruptfileblocks

# Daily: show only partitions with consumer lag.
# LAG is the 6th column of the --describe output, so test that column instead
# of the line ending (which is CLIENT-ID, and would also hide lags like 10).
# Header lines and rows with "-" as lag are dropped by the numeric check.
/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server kafka1:9092 \
  --describe --all-groups | awk '$6 ~ /^[0-9]+$/ && $6 + 0 > 0'