Introduction
This tutorial explains how to set up a multi-node Hadoop 2 (YARN) cluster.
Prerequisites: several machines running Ubuntu 16.04 Server.
In this example there are four virtual machines: one for the NameNode and three for DataNodes.
1. Edit the hosts file
sudo vi /etc/hosts
The /etc/hosts file on each machine should look like this:
127.0.0.1 localhost
192.168.56.101 node1
192.168.56.102 node2
192.168.56.103 node3
192.168.56.104 node4
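A quick sanity check that the names resolve (run from any node; the host names come from the file above):
ping -c 1 node2
ping -c 1 node3
ping -c 1 node4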
2. Install Java on all nodes
sudo apt-get install default-jdk
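To verify the installation and see which JDK was installed (the exact version depends on what default-jdk resolves to):
java -version
readlink -f /usr/bin/java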
3. Set up passwordless SSH to all nodes
Note: in this example, the user name is node1 on every machine.
node1@node1:~$
ssh-keygen
ssh-copy-id node1@node1
ssh-copy-id node1@node2
ssh-copy-id node1@node3
ssh-copy-id node1@node4
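Each node should now be reachable without a password prompt. A quick check:
ssh node1@node2 hostname
ssh node1@node3 hostname
ssh node1@node4 hostname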
4. Download Hadoop binary
wget http://apache.cbox.biz/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz
sudo tar -xvf hadoop-2.7.4.tar.gz -C /usr/local/
sudo ln -s /usr/local/hadoop-2.7.4/ /usr/local/hadoop
sudo chown -R node1:node1 /usr/local/hadoop-2.7.4/
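Since the PATH is not set up yet, the unpacked release can be checked with its full path; it should report version 2.7.4:
/usr/local/hadoop/bin/hadoop version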
5. Set up environment variables
vi ~/.profile
Add the following at the end of the file:
export JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
export PATH=$PATH:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
export HADOOP_HOME=$HADOOP_INSTALL
export HADOOP_CONF_DIR=${HADOOP_HOME}"/etc/hadoop"
export YARN_CONF_DIR=${HADOOP_HOME}"/etc/hadoop"
Then
source ~/.profile
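To check that the variables took effect, the following should all point into /usr/local/hadoop:
echo $HADOOP_HOME
which hadoop
hadoop version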
Change the JAVA_HOME value in $HADOOP_CONF_DIR/hadoop-env.sh:
export JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
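If you prefer to make this change non-interactively, a one-liner like the following should work, assuming hadoop-env.sh still contains the stock export JAVA_HOME=... line:
sed -i "s|^export JAVA_HOME=.*|export JAVA_HOME=$(readlink -f /usr/bin/java | sed 's:bin/java::')|" $HADOOP_CONF_DIR/hadoop-env.sh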
6. Configure the NameNode
mkdir -pv $HADOOP_INSTALL/data/namenode
mkdir -pv $HADOOP_INSTALL/logs
vim $HADOOP_CONF_DIR/hdfs-site.xml
[xml]
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///usr/local/hadoop/data/namenode</value>
<description>NameNode directory for namespace and transaction logs storage.</description>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>node1:50070</value>
<description>Your NameNode hostname for http access.</description>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node1:50090</value>
<description>Your Secondary NameNode hostname for http access.</description>
</property>
</configuration>
[/xml]
vi $HADOOP_CONF_DIR/slaves
node2
node3
node4
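The slaves file is only read by the start-dfs.sh and start-yarn.sh helper scripts on the machine you start the cluster from; it tells them on which hosts to start the DataNode and NodeManager daemons.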
7. Configure the DataNodes
mkdir -pv $HADOOP_INSTALL/data/datanode
mkdir -pv $HADOOP_INSTALL/logs
vim $HADOOP_CONF_DIR/hdfs-site.xml
[xml]
<configuration>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///usr/local/hadoop/data/datanode</value>
<description>DataNode directory</description>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>node1:50070</value>
<description>Your NameNode hostname for http access.</description>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node1:50090</value>
<description>Your Secondary NameNode hostname for http access.</description>
</property>
</configuration>
[/xml]
8. Configuration for the NameNode and DataNodes
vi $HADOOP_CONF_DIR/core-site.xml
[xml]
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://node1/</value>
<description>NameNode URI</description>
</property>
</configuration>
[/xml]
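A fresh Hadoop 2.7.4 install usually ships only mapred-site.xml.template rather than mapred-site.xml; if the file is missing, create it from the template first:
cp $HADOOP_CONF_DIR/mapred-site.xml.template $HADOOP_CONF_DIR/mapred-site.xml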
vi $HADOOP_CONF_DIR/mapred-site.xml
[xml]
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
[/xml]
vi $HADOOP_CONF_DIR/yarn-site.xml
[xml]
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>node1:8025</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>node1:8030</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>node1:8050</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>7000</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>7000</value>
</property>
</configuration>
[/xml]
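The memory and vcore values above (7000 MB and 2 vcores per NodeManager) assume VMs with roughly 8 GB of RAM and 2 CPUs; adjust them to your hardware. The core-site.xml, mapred-site.xml, and yarn-site.xml files should be the same on all four machines, so one option is to copy them from node1 using the passwordless SSH set up earlier (assuming identical paths and user on every node):
for host in node2 node3 node4; do
  scp $HADOOP_CONF_DIR/{core-site.xml,mapred-site.xml,yarn-site.xml} ${host}:$HADOOP_CONF_DIR/
done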
9. Start the cluster and verify the running processes
On the NameNode, execute:
hdfs namenode -format
start-dfs.sh
start-yarn.sh
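To verify, jps on node1 should list NameNode, SecondaryNameNode, and ResourceManager, while jps on node2, node3, and node4 should list DataNode and NodeManager. The HDFS report should show three live datanodes, and the web UIs should be reachable at http://node1:50070 (NameNode, as configured above) and http://node1:8088 (ResourceManager default port):
jps
hdfs dfsadmin -report
yarn node -list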
10. Run MapReduce and Flink examples
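As a smoke test, the bundled MapReduce examples jar can be used to estimate pi; the path below assumes the standard 2.7.4 layout under /usr/local/hadoop:
yarn jar $HADOOP_INSTALL/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar pi 10 100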