Overriding Properties¶
Let us understand how we can override the properties while running hdfs dfs
or hadoop fs
commands.
We can change any property which is not defined as final in core-site.xml or hdfs-site.xml.
We can change
blocksize
as well as replication
while copying the files. We can also change them after copying the files. We can either pass individual properties using
-D
or a bunch of properties by passing an xml file similar to core-site.xml or hdfs-site.xml as part of --conf
. Let’s copy a file /data/crime/csv/rows.csv with default values. The file is split into 12 blocks with 2 copies each (as our default blocksize is 128 MB and replication factor is 2).
%%sh
hdfs dfs -ls /user/${USER}/crime
Found 1 items
drwxr-xr-x - itversity students 0 2021-01-28 17:03 /user/itversity/crime/csv
%%sh
hdfs dfs -rm -R -skipTrash /user/${USER}/crime
Deleted /user/itversity/crime
%%sh
hdfs dfs -mkdir -p /user/${USER}/crime/csv
%%sh
ls -lhtr /data/crime/csv
total 1.5G
-rw-r--r-- 1 root root 1.5G Aug 8 2017 rows.csv
%%sh
hdfs dfs -put /data/crime/csv/rows.csv /user/${USER}/crime/csv
%%sh
hdfs dfs -stat %r /user/${USER}/crime/csv/rows.csv
2
%%sh
hdfs dfs -stat %o /user/${USER}/crime/csv/rows.csv
134217728
%%sh
hdfs dfs -stat %b /user/${USER}/crime/csv/rows.csv
1505540526
%%sh
hdfs dfs -rm -R -skipTrash /user/${USER}/crime/csv/rows.csv
Deleted /user/itversity/crime/csv/rows.csv
%%sh
hdfs dfs -Ddfs.blocksize=64M -Ddfs.replication=3 -put /data/crime/csv/rows.csv /user/${USER}/crime/csv
%%sh
hdfs dfs -stat %r /user/${USER}/crime/csv/rows.csv
3
%%sh
hdfs dfs -stat %o /user/${USER}/crime/csv/rows.csv
67108864
%%sh
hdfs dfs -stat %b /user/${USER}/crime/csv/rows.csv
1505540526
%%sh
ls -ltr /etc/hadoop/conf/
total 196
-rw-r--r-- 1 root root 2250 May 11 2018 yarn-env.cmd
-rw-r--r-- 1 mapred hadoop 2697 May 11 2018 ssl-server.xml.example
-rw-r--r-- 1 mapred hadoop 2316 May 11 2018 ssl-client.xml.example
-rw-r--r-- 1 root root 758 May 11 2018 mapred-site.xml.template
-rw-r--r-- 1 root root 4113 May 11 2018 mapred-queues.xml.template
-rw-r--r-- 1 root root 951 May 11 2018 mapred-env.cmd
-rw-r--r-- 1 root root 5511 May 11 2018 kms-site.xml
-rw-r--r-- 1 root root 1631 May 11 2018 kms-log4j.properties
-rw-r--r-- 1 root root 1527 May 11 2018 kms-env.sh
-rw-r--r-- 1 root root 3518 May 11 2018 kms-acls.xml
-rw-r--r-- 1 root root 2490 May 11 2018 hadoop-metrics.properties
-rw-r--r-- 1 root root 3979 May 11 2018 hadoop-env.cmd
-rw-r--r-- 1 hdfs hadoop 1335 May 11 2018 configuration.xsl
-rw-r--r-- 1 hdfs hadoop 1308 Mar 3 2020 hadoop-policy.xml
-rw-r--r-- 1 hdfs hadoop 884 Mar 3 2020 ssl-client.xml
drwxr-xr-x 2 root hadoop 4096 Mar 3 2020 secure
-rw-r--r-- 1 hdfs hadoop 1000 Mar 3 2020 ssl-server.xml
-rw-r--r-- 1 hdfs hadoop 6531 Mar 3 2020 hdfs-site.xml
-rw-r--r-- 1 hdfs root 96 Mar 3 2020 slaves
-rw-r--r-- 1 mapred hadoop 6984 Mar 3 2020 mapred-site.xml
-rw-r--r-- 1 hdfs hadoop 2135 Mar 3 2020 capacity-scheduler.xml
-rwxr-xr-x 1 yarn hadoop 5359 Mar 3 2020 yarn-env.sh
-rw-r--r-- 1 root hadoop 1019 Mar 3 2020 container-executor.cfg
-rwxr-xr-x 1 hdfs root 818 Mar 3 2020 mapred-env.sh
-rw-r--r-- 1 hdfs root 945 Mar 3 2020 taskcontroller.cfg
-rw-r--r-- 1 hdfs root 1020 Mar 3 2020 commons-logging.properties
-rw-r--r-- 1 hdfs root 1602 Mar 3 2020 health_check
-rw-r--r-- 1 hdfs hadoop 2263 Mar 3 2020 hadoop-metrics2.properties
-rwxr-xr-x 1 root root 4221 Mar 3 2020 task-log4j.properties
-rw-r--r-- 1 hdfs hadoop 319 Mar 3 2020 topology_mappings.data
-rwxr-xr-x 1 root root 2358 Mar 3 2020 topology_script.py
-rw-r--r-- 1 hdfs hadoop 10495 May 6 2020 log4j.properties
-rw-r--r-- 1 yarn hadoop 18438 Dec 14 02:11 yarn-site.xml
-rw-r--r-- 1 hdfs hadoop 5705 Dec 14 02:40 hadoop-env.sh
-rw-r--r-- 1 hdfs hadoop 4990 Jan 8 11:20 core-site.xml
%%sh
cat /etc/hadoop/conf/hdfs-site.xml
<configuration>
<property>
<name>dfs.block.access.token.enable</name>
<value>true</value>
</property>
<property>
<name>dfs.blockreport.initialDelay</name>
<value>120</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.client.read.shortcircuit.streams.cache.size</name>
<value>4096</value>
</property>
<property>
<name>dfs.client.retry.policy.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.cluster.administrators</name>
<value> hdfs</value>
</property>
<property>
<name>dfs.content-summary.limit</name>
<value>5000</value>
</property>
<property>
<name>dfs.datanode.address</name>
<value>0.0.0.0:50010</value>
</property>
<property>
<name>dfs.datanode.balance.bandwidthPerSec</name>
<value>6250000</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/hdp01/hadoop/hdfs/data,/hdp02/hadoop/hdfs/data</value>
</property>
<property>
<name>dfs.datanode.data.dir.perm</name>
<value>750</value>
</property>
<property>
<name>dfs.datanode.du.reserved</name>
<value>2563350016</value>
</property>
<property>
<name>dfs.datanode.failed.volumes.tolerated</name>
<value>0</value>
</property>
<property>
<name>dfs.datanode.http.address</name>
<value>0.0.0.0:50075</value>
</property>
<property>
<name>dfs.datanode.https.address</name>
<value>0.0.0.0:50475</value>
</property>
<property>
<name>dfs.datanode.ipc.address</name>
<value>0.0.0.0:8010</value>
</property>
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>4096</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
<property>
<name>dfs.encrypt.data.transfer.cipher.suites</name>
<value>AES/CTR/NoPadding</value>
</property>
<property>
<name>dfs.heartbeat.interval</name>
<value>3</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/etc/hadoop/conf/dfs.exclude</value>
</property>
<property>
<name>dfs.http.policy</name>
<value>HTTP_ONLY</value>
</property>
<property>
<name>dfs.https.port</name>
<value>50470</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/hadoop/hdfs/journalnode</value>
</property>
<property>
<name>dfs.journalnode.http-address</name>
<value>0.0.0.0:8480</value>
</property>
<property>
<name>dfs.journalnode.https-address</name>
<value>0.0.0.0:8481</value>
</property>
<property>
<name>dfs.namenode.accesstime.precision</name>
<value>3600000</value>
</property>
<property>
<name>dfs.namenode.audit.log.async</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.avoid.read.stale.datanode</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.avoid.write.stale.datanode</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>/var/hadoop/hdfs/namesecondary</value>
</property>
<property>
<name>dfs.namenode.checkpoint.edits.dir</name>
<value>${dfs.namenode.checkpoint.dir}</value>
</property>
<property>
<name>dfs.namenode.checkpoint.period</name>
<value>21600</value>
</property>
<property>
<name>dfs.namenode.checkpoint.txns</name>
<value>1000000</value>
</property>
<property>
<name>dfs.namenode.fslock.fair</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.handler.count</name>
<value>200</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>172.16.1.101:50070</value>
</property>
<property>
<name>dfs.namenode.https-address</name>
<value>nn01.itversity.com:50470</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/hdp01/hadoop/hdfs/namenode,/hdp02/hadoop/hdfs/namenode</value>
</property>
<property>
<name>dfs.namenode.name.dir.restore</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.rpc-address</name>
<value>nn01.itversity.com:8020</value>
</property>
<property>
<name>dfs.namenode.safemode.threshold-pct</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>nn02.itversity.com:50090</value>
</property>
<property>
<name>dfs.namenode.stale.datanode.interval</name>
<value>30000</value>
</property>
<property>
<name>dfs.namenode.startup.delay.block.deletion.sec</name>
<value>3600</value>
</property>
<property>
<name>dfs.namenode.write.stale.datanode.ratio</name>
<value>1.0f</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.permissions.superusergroup</name>
<value>hdfs</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.replication.max</name>
<value>50</value>
</property>
<property>
<name>dfs.support.append</name>
<value>true</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>fs.permissions.umask-mode</name>
<value>022</value>
</property>
<property>
<name>nfs.exports.allowed.hosts</name>
<value>* rw</value>
</property>
<property>
<name>nfs.file.dump.dir</name>
<value>/tmp/.hdfs-nfs</value>
</property>
</configuration>