
Installation

# Download and extract the binaries
# (superseded releases move to https://archive.apache.org/dist/hadoop/common/)
curl -O "https://downloads.apache.org/hadoop/common/hadoop-3.3.3/hadoop-3.3.3.tar.gz"
tar -xzvf "hadoop-3.3.3.tar.gz"
# the archive unpacks to ./hadoop-3.3.3; move it to match HADOOP_HOME below
mv hadoop-3.3.3 "$HOME/hadoop"

# Environment variables
echo 'export HADOOP_HOME=$HOME/hadoop' >> ~/.zshrc
echo 'export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin' >> ~/.zshrc

echo 'export HADOOP_INSTALL=$HADOOP_HOME' >> ~/.zshrc
echo 'export HADOOP_MAPRED_HOME=$HADOOP_HOME' >> ~/.zshrc
echo 'export HADOOP_COMMON_HOME=$HADOOP_HOME' >> ~/.zshrc
echo 'export HADOOP_HDFS_HOME=$HADOOP_HOME' >> ~/.zshrc
echo 'export YARN_HOME=$HADOOP_HOME' >> ~/.zshrc
echo 'export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native' >> ~/.zshrc
echo 'export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"' >> ~/.zshrc

# Testing (reload the shell config first so the exports take effect)
source ~/.zshrc
hadoop version

Configuration files

hadoop-env.sh

  • $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# use the Java 8 JDK for Hadoop
echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk' >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
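The JAVA_HOME above follows the Arch Linux package layout; the path differs per distro. A quick sanity check that the path really holds a Java 8 runtime:

/usr/lib/jvm/java-8-openjdk/bin/java -version   # should report openjdk version "1.8.0_..."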

core-site.xml

  • $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
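To confirm Hadoop picks the setting up, query it back with hdfs getconf:

# should print hdfs://localhost:9000
hdfs getconf -confKey fs.defaultFS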

hdfs-site.xml

  • $HADOOP_HOME/etc/hadoop/hdfs-site.xml

  • First, create the following directories to hold HDFS data (see the command after this list)

  • $HADOOP_HOME/data/namenode
  • $HADOOP_HOME/data/datanode
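For example:

mkdir -p "$HADOOP_HOME/data/namenode" "$HADOOP_HOME/data/datanode"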
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value> <!-- single-node dev setup: one copy per block, saves disk -->
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <!-- absolute path to $HADOOP_HOME/data/namenode; replace /home/user with your home -->
    <value>/home/user/hadoop/data/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/home/user/hadoop/data/datanode</value>
  </property>
</configuration>
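Before starting HDFS for the first time, format the namenode so it initializes dfs.namenode.name.dir (a one-time step; rerunning it wipes the filesystem metadata):

hdfs namenode -format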

mapred-site.xml

  • $HADOOP_HOME/etc/hadoop/mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>

yarn-site.xml

  • $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
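With everything configured, the start scripts in $HADOOP_HOME/sbin (already on PATH from the exports above) bring the daemons up; the examples jar bundled with the release gives a quick MapReduce-on-YARN smoke test:

start-dfs.sh    # NameNode, DataNode, SecondaryNameNode
start-yarn.sh   # ResourceManager, NodeManager
jps             # should list the daemons above

# estimate pi with 2 maps x 4 samples per map using the bundled examples jar
hadoop jar "$HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.3.jar" pi 2 4

If the job fails because it cannot find MRAppMaster, Hadoop 3 also needs mapreduce.application.classpath set in mapred-site.xml (see the official single-node setup guide).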

HDP on Docker

# root:hadoop is the default user:pass
ssh root@localhost -p 2222
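For reference, a minimal sketch of starting the sandbox container (assumes the public hortonworks/sandbox-hdp image and that its sshd sits behind host port 2222; the official Cloudera deploy scripts map many more service ports):

docker run --name sandbox-hdp -d -p 2222:22 hortonworks/sandbox-hdp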