Skip to content

集群部署与运维

部署架构规划

典型生产集群规划

┌─────────────────────────────────────────────────────────────┐
│                    生产集群规划(示例)                        │
│                                                             │
│  管理节点(3台,高可用)                                      │
│  ├── NameNode(Active + Standby)                           │
│  ├── ResourceManager(Active + Standby)                    │
│  ├── HMaster                                               │
│  ├── Zookeeper(3节点)                                     │
│  └── JournalNode(3节点)                                   │
│                                                             │
│  数据节点(N台,按需扩展)                                    │
│  ├── DataNode                                              │
│  ├── NodeManager                                           │
│  └── RegionServer(HBase)                                  │
│                                                             │
│  计算节点(可与数据节点合并)                                  │
│  ├── Spark Executor                                        │
│  └── Flink TaskManager                                     │
│                                                             │
│  服务节点(独立部署)                                         │
│  ├── HiveServer2                                           │
│  ├── Kafka Broker(3台)                                    │
│  ├── Airflow(Scheduler + WebServer)                       │
│  └── 监控(Prometheus + Grafana)                           │
└─────────────────────────────────────────────────────────────┘

硬件配置建议

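| 节点类型 | CPU | 内存 | 磁盘 | 网络 |
| --- | --- | --- | --- | --- |
| 管理节点 | 16核 | 64GB | SSD 500GB | 万兆 |
| 数据节点 | 32核 | 128GB | HDD 12x4TB | 万兆 |
| Kafka 节点 | 16核 | 64GB | SSD 4x2TB | 万兆 |
| OLAP 节点 | 32核 | 256GB | SSD 8x2TB | 万兆 |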

Hadoop 集群部署

环境准备

bash
# Run on every node, as root. Each step is written so that the block can be
# re-run without duplicating entries or prompting.

# 1. Disable the firewall (now, and across reboots)
systemctl stop firewalld
systemctl disable firewalld

# 2. Disable SELinux
# setenforce exits non-zero when SELinux is already disabled; that is fine here.
setenforce 0 || true
# Anchor at line start so only the active setting is rewritten.
sed -i 's/^SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config

# 3. Configure /etc/hosts
# Append an entry only when the hostname is not present yet, so a second run
# does not duplicate lines.
while read -r ip name; do
  grep -qw -- "$name" /etc/hosts || printf '%s  %s\n' "$ip" "$name" >> /etc/hosts
done << 'EOF'
192.168.1.10  nn1
192.168.1.11  nn2
192.168.1.12  zk1
192.168.1.13  zk2
192.168.1.14  zk3
192.168.1.20  dn1
192.168.1.21  dn2
192.168.1.22  dn3
EOF

# 4. Passwordless SSH
# Keep an existing key instead of letting ssh-keygen prompt to overwrite it.
[[ -f ~/.ssh/id_rsa ]] || ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
# Push the public key to every node listed in step 3.
for host in nn1 nn2 zk1 zk2 zk3 dn1 dn2 dn3; do
  ssh-copy-id "hadoop@${host}"
done

# 5. Install the JDK
yum install -y java-1.8.0-openjdk-devel
grep -q '^export JAVA_HOME=' /etc/profile \
  || echo 'export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk' >> /etc/profile
source /etc/profile

# 6. Clock synchronisation
yum install -y ntpdate
ntpdate ntp.aliyun.com
# Install the job through crontab(1) rather than writing the spool file
# directly, and call ntpdate by absolute path: a user crontab's default PATH
# normally omits /usr/sbin, where ntpdate is installed.
cron_line='*/5 * * * * /usr/sbin/ntpdate ntp.aliyun.com >/dev/null 2>&1'
(
  crontab -l 2>/dev/null | grep -vF 'ntpdate ntp.aliyun.com'
  printf '%s\n' "$cron_line"
) | crontab -

Hadoop 配置

xml
<!-- core-site.xml -->
<!-- Cluster-wide settings shared by HDFS clients and daemons. -->
<configuration>
  <!-- Default filesystem URI. "mycluster" is the logical HA nameservice name
       (it must match dfs.nameservices in hdfs-site.xml), not a real host. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://mycluster</value>
  </property>
  <!-- Base directory for Hadoop's local temporary data. -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop/tmp</value>
  </property>
  <!-- ZooKeeper ensemble (host:port list) used for HA coordination. -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>zk1:2181,zk2:2181,zk3:2181</value>
  </property>
</configuration>
xml
<!-- hdfs-site.xml (HA configuration) -->
<configuration>
  <!-- Logical name of the HA nameservice; referenced by fs.defaultFS. -->
  <property>
    <name>dfs.nameservices</name>
    <value>mycluster</value>
  </property>
  <!-- IDs of the two NameNodes that make up the nameservice. -->
  <property>
    <name>dfs.ha.namenodes.mycluster</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC endpoint of each NameNode. -->
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn1</name>
    <value>nn1:9000</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn2</name>
    <value>nn2:9000</value>
  </property>
  <!-- JournalNode quorum that stores the shared edit log. -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://zk1:8485;zk2:8485;zk3:8485/mycluster</value>
  </property>
  <!-- Local directory where each JournalNode keeps its edits. Set explicitly
       so production edit logs do not land in the default location under /tmp. -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/data/hadoop/journal</value>
  </property>
  <!-- Required for clients: without a failover proxy provider the logical
       URI hdfs://mycluster cannot be resolved to the active NameNode. -->
  <property>
    <name>dfs.client.failover.proxy.provider.mycluster</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <!-- Enable ZKFC-driven automatic failover. -->
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <!-- Required when automatic failover is enabled: a fencing method must be
       configured. Methods are tried in order, one per line; shell(/bin/true)
       is the always-succeeding fallback, so failover can still proceed if
       sshfence cannot reach the old active node. -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>
      sshfence
      shell(/bin/true)
    </value>
  </property>
  <!-- Private key used by sshfence. Assumes the daemons run as user "hadoop"
       with the key generated during environment preparation; adjust the path
       if a different account is used. -->
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/home/hadoop/.ssh/id_rsa</value>
  </property>
  <!-- Default block replication factor. -->
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <!-- Block size in bytes: 134217728 = 128 MiB. -->
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>
</configuration>

初始化与启动

bash
# First-time HA bring-up. The steps run on different nodes and their order
# matters; this is a checklist, not a single script.
# NOTE(review): hadoop-daemon.sh is deprecated in Hadoop 3.x in favour of
# `hdfs --daemon start <daemon>` — confirm which Hadoop version is targeted.

# 1. Initialise the HA state in ZooKeeper (run ONCE, on nn1 only)
#    Presumably requires the ZooKeeper ensemble to be running already — confirm.
hdfs zkfc -formatZK

# 2. Start the JournalNode (on every JournalNode host)
hadoop-daemon.sh start journalnode

# 3. Format the NameNode (run ONCE, on nn1 only)
hdfs namenode -format

# 4. Start the NameNode on nn1 (it will become the active one)
hadoop-daemon.sh start namenode

# 5. Copy the namespace to the standby NameNode (run on nn2)
hdfs namenode -bootstrapStandby

# 6. Start all HDFS and YARN services
start-dfs.sh
start-yarn.sh

# 7. Verify: DataNode capacity/status report and registered NodeManagers
hdfs dfsadmin -report
yarn node -list

Kafka 集群部署

bash
# Download and unpack Kafka 3.6.0 (Scala 2.13 build).
# downloads.apache.org only carries current releases; older versions such as
# 3.6.0 are served from archive.apache.org, which keeps every release.
wget https://archive.apache.org/dist/kafka/3.6.0/kafka_2.13-3.6.0.tgz
tar -xzf kafka_2.13-3.6.0.tgz -C /opt/
# -sfn replaces an existing /opt/kafka link; a plain `ln -s` on re-run would
# instead create a nested link inside the directory the old link points to.
ln -sfn /opt/kafka_2.13-3.6.0 /opt/kafka
properties
# config/server.properties (per-broker settings; values marked below differ on each broker)

# Broker ID (must be unique on every node)
broker.id=1

# Listener addresses: the address the broker binds to and the address it
# advertises to clients. Use this broker's own hostname on each node.
listeners=PLAINTEXT://kafka1:9092
advertised.listeners=PLAINTEXT://kafka1:9092

# Log (data) directories, one per physical disk
log.dirs=/data1/kafka,/data2/kafka,/data3/kafka

# ZooKeeper connection string; the trailing /kafka is a chroot path that
# keeps Kafka's znodes under their own subtree
zookeeper.connect=zk1:2181,zk2:2181,zk3:2181/kafka

# Message retention: 168 hours = 7 days; segment size 1073741824 bytes = 1 GiB
log.retention.hours=168
log.segment.bytes=1073741824

# Replication defaults for automatically created topics, and the minimum
# number of in-sync replicas
default.replication.factor=3
min.insync.replicas=2
num.partitions=6

# Performance tuning: network/IO thread counts and socket buffer sizes (bytes)
num.network.threads=8
num.io.threads=16
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
bash
# Kafka installation root (the /opt/kafka symlink)
kafka_home=/opt/kafka

# Launch the broker as a background daemon with its server.properties
"${kafka_home}/bin/kafka-server-start.sh" -daemon "${kafka_home}/config/server.properties"

# Smoke test: listing topics succeeds only if the broker is reachable
"${kafka_home}/bin/kafka-topics.sh" --list --bootstrap-server kafka1:9092

Docker Compose 快速部署(开发环境)

yaml
# docker-compose.yml
# Single-host development stack: ZooKeeper, Kafka, HDFS, Hive metastore, Spark.
# Not suitable for production (single replicas, no authentication).
version: '3.8'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
    ports:
      - "2181:2181"

  kafka:
    image: confluentinc/cp-kafka:7.5.0
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # Two listeners: containers on the compose network connect to
      # kafka:29092, while clients on the host keep using localhost:9092.
      # Advertising only localhost makes the broker unreachable from the
      # other containers (e.g. Spark jobs).
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      # Single broker, so the offsets topic cannot be replicated.
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    ports:
      - "9092:9092"

  hadoop-namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      - "9870:9870"   # NameNode web UI
      - "9000:9000"   # NameNode RPC
    volumes:
      - hadoop_namenode:/hadoop/dfs/name

  hadoop-datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    depends_on:
      - hadoop-namenode
    env_file:
      - ./hadoop.env
    volumes:
      - hadoop_datanode:/hadoop/dfs/data

  # Backing database for the Hive metastore.
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.3.0

  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    env_file:
      # Assumes hadoop.env carries the HIVE_SITE_CONF_javax_jdo_option_*
      # settings pointing at hive-metastore-postgresql — TODO confirm.
      - ./hadoop.env
    # Without an explicit command the image runs its default startup instead
    # of the metastore service that the 9083 port mapping implies.
    command: /opt/hive/bin/hive --service metastore
    environment:
      # Wait for HDFS and the database before starting the metastore.
      SERVICE_PRECONDITION: "hadoop-namenode:9870 hive-metastore-postgresql:5432"
    depends_on:
      - hadoop-namenode
      - hive-metastore-postgresql
    ports:
      - "9083:9083"   # metastore Thrift endpoint

  spark-master:
    image: bde2020/spark-master:3.3.0-hadoop3.3
    ports:
      - "8080:8080"   # Spark master web UI
      - "7077:7077"   # Spark master RPC

  spark-worker:
    image: bde2020/spark-worker:3.3.0-hadoop3.3
    depends_on:
      - spark-master
    environment:
      - SPARK_MASTER=spark://spark-master:7077

volumes:
  hadoop_namenode:
  hadoop_datanode:

日常运维

扩容 DataNode

bash
# 1. Install Hadoop on the new node and give it the same hdfs-site.xml as the
#    existing cluster.
#    NOTE(review): core-site.xml presumably has to be copied as well, since
#    fs.defaultFS is defined there — confirm.
# 2. Start the DataNode (run on the new node)
hadoop-daemon.sh start datanode

# 3. Trigger rebalancing so existing blocks spread onto the new node.
#    -threshold 10: allowed deviation, in percent, of each DataNode's disk
#    usage from the cluster average.
hdfs balancer -threshold 10

# 4. Verify that the new DataNode appears in the report
hdfs dfsadmin -report

磁盘故障处理

bash
# Look for failed volumes.
# `hdfs dfsadmin -report` prints no per-volume failure line, so read the
# counters from the NameNode JMX endpoint instead (web port 9870 on
# Hadoop 3.x, 50070 on 2.x), and list DataNodes that are already dead.
curl -s 'http://nn1:9870/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState' \
  | grep -E '"(VolumeFailuresTotal|EstimatedCapacityLostTotal)"'
hdfs dfsadmin -report -dead

# Decommission the faulty DataNode.
# dfsadmin has no -decommission subcommand: add the host to the exclude file
# and ask the NameNodes to re-read it.
# Prerequisite: dfs.hosts.exclude in hdfs-site.xml must point at this file,
# and the file must be present on both NameNode hosts.
exclude_file="${HADOOP_CONF_DIR:-${HADOOP_HOME}/etc/hadoop}/dfs.exclude"
grep -qxF 'dn3' "$exclude_file" 2>/dev/null || echo 'dn3' >> "$exclude_file"
hdfs dfsadmin -refreshNodes

# Wait for re-replication to finish: dn3 disappears from the
# "decommissioning" list (its status becomes "Decommissioned") and the
# under-replicated block count drops back to 0.
hdfs dfsadmin -report -decommissioning
hdfs dfsadmin -report | grep "Under replicated"

# Once decommissioning has completed, stop the DataNode (run on dn3)
hadoop-daemon.sh stop datanode

定期维护任务

bash
#######################################
# Filter `kafka-consumer-groups.sh --describe` output down to partitions
# that actually have lag.
# Input (stdin):  the tool's table; LAG is the 6th column
#                 (GROUP TOPIC PARTITION CURRENT-OFFSET LOG-END-OFFSET LAG ...).
# Output (stdout): header rows plus rows whose LAG is a number greater than 0.
#                  Rows with a non-numeric LAG ("-") are dropped.
#######################################
lagging_only() {
  awk '$1 == "GROUP" { print; next }
       NF >= 6 && $6 ~ /^[0-9]+$/ && $6 + 0 > 0'
}

# Weekly: rebalance HDFS
hdfs balancer -threshold 10

# Monthly: clean up Hive scratch files.
# The glob is quoted so HDFS expands it; unquoted, the local shell would
# expand it first whenever a local /tmp/hive directory exists.
# Run this in a maintenance window: scratch directories of queries that are
# still running live under the same path.
hdfs dfs -rm -r '/tmp/hive/*'

# Daily: check HDFS health (lists files with corrupt blocks)
hdfs fsck / -list-corruptfileblocks

# Daily: check Kafka consumer lag.
# Filtering on the LAG column replaces `grep -v "0$"`, which tested the last
# column (CLIENT-ID) and would also have hidden lags such as 10 or 100.
/opt/kafka/bin/kafka-consumer-groups.sh --bootstrap-server kafka1:9092 \
  --describe --all-groups | lagging_only

本站内容由 褚成志 整理编写,仅供学习参考