Overriding Properties

Let us understand how we can override configuration properties while running hdfs dfs or hadoop fs commands.

  • We can change any property that is not defined as final in core-site.xml or hdfs-site.xml.

  • We can change the block size as well as the replication factor while copying files. We can also change them after the files have been copied, as demonstrated later in this section.

  • We can either pass individual properties using -D or a bunch of properties via an XML file (similar in structure to core-site.xml or hdfs-site.xml) passed with -conf, as sketched at the end of this section.

  • Let us copy the file /data/crime/csv/rows.csv with default values. The file is split into 12 blocks with 2 copies each (as our default block size is 128 MB and the replication factor is 2).

%%sh

hdfs dfs -ls /user/${USER}/crime
Found 1 items
drwxr-xr-x   - itversity students          0 2021-01-28 17:03 /user/itversity/crime/csv
%%sh

hdfs dfs -rm -R -skipTrash /user/${USER}/crime
Deleted /user/itversity/crime
%%sh

hdfs dfs -mkdir -p /user/${USER}/crime/csv
%%sh

ls -lhtr /data/crime/csv
total 1.5G
-rw-r--r-- 1 root root 1.5G Aug  8  2017 rows.csv
%%sh

hdfs dfs -put /data/crime/csv/rows.csv /user/${USER}/crime/csv
%%sh

hdfs dfs -stat %r /user/${USER}/crime/csv/rows.csv
2
%%sh

hdfs dfs -stat %o /user/${USER}/crime/csv/rows.csv
134217728
%%sh

hdfs dfs -stat %b /user/${USER}/crime/csv/rows.csv
1505540526
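
The stat format specifiers used above print the replication factor (%r), the block size in bytes (%o) and the file size in bytes (%b). Dividing the file size by the block size and rounding up gives the block count; a quick sanity check using shell arithmetic:

%%sh

# ceil(1505540526 / 134217728) = 12 blocks, each stored twice (replication 2)
echo $(( (1505540526 + 134217728 - 1) / 134217728 ))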
%%sh

hdfs dfs -rm -R -skipTrash /user/${USER}/crime/csv/rows.csv
Deleted /user/itversity/crime/csv/rows.csv
%%sh

hdfs dfs -Ddfs.blocksize=64M -Ddfs.replication=3 -put /data/crime/csv/rows.csv /user/${USER}/crime/csv
%%sh

hdfs dfs -stat %r /user/${USER}/crime/csv/rows.csv
3
%%sh

hdfs dfs -stat %o /user/${USER}/crime/csv/rows.csv
67108864
%%sh

hdfs dfs -stat %b /user/${USER}/crime/csv/rows.csv
1505540526
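
With the 64 MB block size, the same file now occupies 23 blocks (the ceiling of 1505540526 / 67108864) with 3 replicas each. As noted earlier, these properties can also be changed after the files are copied. A minimal sketch using hdfs dfs -setrep; the -w flag waits until the target replication is reached, which can take a while on large files:

%%sh

# lower the replication factor of the already copied file back to 2
hdfs dfs -setrep -w 2 /user/${USER}/crime/csv/rows.csv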
%%sh

ls -ltr /etc/hadoop/conf/
total 196
-rw-r--r-- 1 root   root    2250 May 11  2018 yarn-env.cmd
-rw-r--r-- 1 mapred hadoop  2697 May 11  2018 ssl-server.xml.example
-rw-r--r-- 1 mapred hadoop  2316 May 11  2018 ssl-client.xml.example
-rw-r--r-- 1 root   root     758 May 11  2018 mapred-site.xml.template
-rw-r--r-- 1 root   root    4113 May 11  2018 mapred-queues.xml.template
-rw-r--r-- 1 root   root     951 May 11  2018 mapred-env.cmd
-rw-r--r-- 1 root   root    5511 May 11  2018 kms-site.xml
-rw-r--r-- 1 root   root    1631 May 11  2018 kms-log4j.properties
-rw-r--r-- 1 root   root    1527 May 11  2018 kms-env.sh
-rw-r--r-- 1 root   root    3518 May 11  2018 kms-acls.xml
-rw-r--r-- 1 root   root    2490 May 11  2018 hadoop-metrics.properties
-rw-r--r-- 1 root   root    3979 May 11  2018 hadoop-env.cmd
-rw-r--r-- 1 hdfs   hadoop  1335 May 11  2018 configuration.xsl
-rw-r--r-- 1 hdfs   hadoop  1308 Mar  3  2020 hadoop-policy.xml
-rw-r--r-- 1 hdfs   hadoop   884 Mar  3  2020 ssl-client.xml
drwxr-xr-x 2 root   hadoop  4096 Mar  3  2020 secure
-rw-r--r-- 1 hdfs   hadoop  1000 Mar  3  2020 ssl-server.xml
-rw-r--r-- 1 hdfs   hadoop  6531 Mar  3  2020 hdfs-site.xml
-rw-r--r-- 1 hdfs   root      96 Mar  3  2020 slaves
-rw-r--r-- 1 mapred hadoop  6984 Mar  3  2020 mapred-site.xml
-rw-r--r-- 1 hdfs   hadoop  2135 Mar  3  2020 capacity-scheduler.xml
-rwxr-xr-x 1 yarn   hadoop  5359 Mar  3  2020 yarn-env.sh
-rw-r--r-- 1 root   hadoop  1019 Mar  3  2020 container-executor.cfg
-rwxr-xr-x 1 hdfs   root     818 Mar  3  2020 mapred-env.sh
-rw-r--r-- 1 hdfs   root     945 Mar  3  2020 taskcontroller.cfg
-rw-r--r-- 1 hdfs   root    1020 Mar  3  2020 commons-logging.properties
-rw-r--r-- 1 hdfs   root    1602 Mar  3  2020 health_check
-rw-r--r-- 1 hdfs   hadoop  2263 Mar  3  2020 hadoop-metrics2.properties
-rwxr-xr-x 1 root   root    4221 Mar  3  2020 task-log4j.properties
-rw-r--r-- 1 hdfs   hadoop   319 Mar  3  2020 topology_mappings.data
-rwxr-xr-x 1 root   root    2358 Mar  3  2020 topology_script.py
-rw-r--r-- 1 hdfs   hadoop 10495 May  6  2020 log4j.properties
-rw-r--r-- 1 yarn   hadoop 18438 Dec 14 02:11 yarn-site.xml
-rw-r--r-- 1 hdfs   hadoop  5705 Dec 14 02:40 hadoop-env.sh
-rw-r--r-- 1 hdfs   hadoop  4990 Jan  8 11:20 core-site.xml
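
The hdfs-site.xml in this directory holds the cluster-level defaults we have been overriding. To check a single property without reading the whole file, a simple grep is enough, assuming the name and value elements sit on consecutive lines as they do here:

%%sh

# print the dfs.blocksize name element and the value line that follows it
grep -A 1 dfs.blocksize /etc/hadoop/conf/hdfs-site.xml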
%%sh

cat /etc/hadoop/conf/hdfs-site.xml
  <configuration>
    
    <property>
      <name>dfs.block.access.token.enable</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.blockreport.initialDelay</name>
      <value>120</value>
    </property>
    
    <property>
      <name>dfs.blocksize</name>
      <value>134217728</value>
    </property>
    
    <property>
      <name>dfs.client.read.shortcircuit</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.client.read.shortcircuit.streams.cache.size</name>
      <value>4096</value>
    </property>
    
    <property>
      <name>dfs.client.retry.policy.enabled</name>
      <value>false</value>
    </property>
    
    <property>
      <name>dfs.cluster.administrators</name>
      <value> hdfs</value>
    </property>
    
    <property>
      <name>dfs.content-summary.limit</name>
      <value>5000</value>
    </property>
    
    <property>
      <name>dfs.datanode.address</name>
      <value>0.0.0.0:50010</value>
    </property>
    
    <property>
      <name>dfs.datanode.balance.bandwidthPerSec</name>
      <value>6250000</value>
    </property>
    
    <property>
      <name>dfs.datanode.data.dir</name>
      <value>/hdp01/hadoop/hdfs/data,/hdp02/hadoop/hdfs/data</value>
    </property>
    
    <property>
      <name>dfs.datanode.data.dir.perm</name>
      <value>750</value>
    </property>
    
    <property>
      <name>dfs.datanode.du.reserved</name>
      <value>2563350016</value>
    </property>
    
    <property>
      <name>dfs.datanode.failed.volumes.tolerated</name>
      <value>0</value>
    </property>
    
    <property>
      <name>dfs.datanode.http.address</name>
      <value>0.0.0.0:50075</value>
    </property>
    
    <property>
      <name>dfs.datanode.https.address</name>
      <value>0.0.0.0:50475</value>
    </property>
    
    <property>
      <name>dfs.datanode.ipc.address</name>
      <value>0.0.0.0:8010</value>
    </property>
    
    <property>
      <name>dfs.datanode.max.transfer.threads</name>
      <value>4096</value>
    </property>
    
    <property>
      <name>dfs.domain.socket.path</name>
      <value>/var/lib/hadoop-hdfs/dn_socket</value>
    </property>
    
    <property>
      <name>dfs.encrypt.data.transfer.cipher.suites</name>
      <value>AES/CTR/NoPadding</value>
    </property>
    
    <property>
      <name>dfs.heartbeat.interval</name>
      <value>3</value>
    </property>
    
    <property>
      <name>dfs.hosts.exclude</name>
      <value>/etc/hadoop/conf/dfs.exclude</value>
    </property>
    
    <property>
      <name>dfs.http.policy</name>
      <value>HTTP_ONLY</value>
    </property>
    
    <property>
      <name>dfs.https.port</name>
      <value>50470</value>
    </property>
    
    <property>
      <name>dfs.journalnode.edits.dir</name>
      <value>/hadoop/hdfs/journalnode</value>
    </property>
    
    <property>
      <name>dfs.journalnode.http-address</name>
      <value>0.0.0.0:8480</value>
    </property>
    
    <property>
      <name>dfs.journalnode.https-address</name>
      <value>0.0.0.0:8481</value>
    </property>
    
    <property>
      <name>dfs.namenode.accesstime.precision</name>
      <value>3600000</value>
    </property>
    
    <property>
      <name>dfs.namenode.audit.log.async</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.namenode.avoid.read.stale.datanode</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.namenode.avoid.write.stale.datanode</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.namenode.checkpoint.dir</name>
      <value>/var/hadoop/hdfs/namesecondary</value>
    </property>
    
    <property>
      <name>dfs.namenode.checkpoint.edits.dir</name>
      <value>${dfs.namenode.checkpoint.dir}</value>
    </property>
    
    <property>
      <name>dfs.namenode.checkpoint.period</name>
      <value>21600</value>
    </property>
    
    <property>
      <name>dfs.namenode.checkpoint.txns</name>
      <value>1000000</value>
    </property>
    
    <property>
      <name>dfs.namenode.fslock.fair</name>
      <value>false</value>
    </property>
    
    <property>
      <name>dfs.namenode.handler.count</name>
      <value>200</value>
    </property>
    
    <property>
      <name>dfs.namenode.http-address</name>
      <value>172.16.1.101:50070</value>
    </property>
    
    <property>
      <name>dfs.namenode.https-address</name>
      <value>nn01.itversity.com:50470</value>
    </property>
    
    <property>
      <name>dfs.namenode.name.dir</name>
      <value>/hdp01/hadoop/hdfs/namenode,/hdp02/hadoop/hdfs/namenode</value>
    </property>
    
    <property>
      <name>dfs.namenode.name.dir.restore</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.namenode.rpc-address</name>
      <value>nn01.itversity.com:8020</value>
    </property>
    
    <property>
      <name>dfs.namenode.safemode.threshold-pct</name>
      <value>1</value>
    </property>
    
    <property>
      <name>dfs.namenode.secondary.http-address</name>
      <value>nn02.itversity.com:50090</value>
    </property>
    
    <property>
      <name>dfs.namenode.stale.datanode.interval</name>
      <value>30000</value>
    </property>
    
    <property>
      <name>dfs.namenode.startup.delay.block.deletion.sec</name>
      <value>3600</value>
    </property>
    
    <property>
      <name>dfs.namenode.write.stale.datanode.ratio</name>
      <value>1.0f</value>
    </property>
    
    <property>
      <name>dfs.permissions.enabled</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.permissions.superusergroup</name>
      <value>hdfs</value>
    </property>
    
    <property>
      <name>dfs.replication</name>
      <value>2</value>
    </property>
    
    <property>
      <name>dfs.replication.max</name>
      <value>50</value>
    </property>
    
    <property>
      <name>dfs.support.append</name>
      <value>true</value>
    </property>
    
    <property>
      <name>dfs.webhdfs.enabled</name>
      <value>true</value>
    </property>
    
    <property>
      <name>fs.permissions.umask-mode</name>
      <value>022</value>
    </property>
    
    <property>
      <name>nfs.exports.allowed.hosts</name>
      <value>* rw</value>
    </property>
    
    <property>
      <name>nfs.file.dump.dir</name>
      <value>/tmp/.hdfs-nfs</value>
    </property>
    
  </configuration>
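
Instead of repeating -D for every property, we can bundle the overrides into an XML file modeled on the hdfs-site.xml above and pass it with the generic -conf option. A minimal sketch, assuming a hypothetical overrides.xml written to the current working directory:

%%sh

# build a small configuration overlay with the two overrides
cat > overrides.xml << 'EOF'
<configuration>
  <property>
    <name>dfs.blocksize</name>
    <value>67108864</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
</configuration>
EOF

# remove the earlier copy first so -put does not fail with "File exists"
hdfs dfs -rm -skipTrash /user/${USER}/crime/csv/rows.csv

# generic options such as -conf and -D must come before the command (-put)
hdfs dfs -conf overrides.xml -put /data/crime/csv/rows.csv /user/${USER}/crime/csv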