The bash code below is used for hierarchical merge of 16 CSV file (one per year, each is about 5 GB). It is based on an answer on this forum "Joining 2 CSV files based on common field name" However, it is quite slow. Any idea for improvement? Multiprocessing or so? (I have mac with 8 cores and 16 GB of RAM)
#!/bin/bash
csv2000="`ls *_2000.csv`"
csv2001="`ls *_2001.csv`"
csv2002="`ls *_2002.csv`"
csv2003="`ls *_2003.csv`"
csv2004="`ls *_2004.csv`"
csv2005="`ls *_2005.csv`"
csv2006="`ls *_2006.csv`"
csv2007="`ls *_2007.csv`"
csv2008="`ls *_2008.csv`"
csv2009="`ls *_2009.csv`"
csv2010="`ls *_2010.csv`"
csv2011="`ls *_2011.csv`"
csv2012="`ls *_2012.csv`"
csv2013="`ls *_2013.csv`"
csv2014="`ls *_2014.csv`"
csv2015="`ls *_2015.csv`"
temp01_csv="temp01.csv"
temp02_csv="temp02.csv"
temp03_csv="temp03.csv"
temp04_csv="temp04.csv"
temp05_csv="temp05.csv"
temp06_csv="temp06.csv"
temp07_csv="temp07.csv"
temp08_csv="temp08.csv"
temp09_csv="temp09.csv"
temp10_csv="temp10.csv"
temp11_csv="temp11.csv"
temp12_csv="temp12.csv"
temp13_csv="temp13.csv"
temp14_csv="temp14.csv"
temp15_csv="temp15.csv"
join -a1 -t';' $csv2000 $csv2001 > $temp01_csv
join -a1 -t';' $csv2002 $csv2003 > $temp02_csv
join -a1 -t';' $csv2004 $csv2005 > $temp03_csv
join -a1 -t';' $csv2006 $csv2007 > $temp04_csv
join -a1 -t';' $csv2008 $csv2009 > $temp05_csv
join -a1 -t';' $csv2010 $csv2011 > $temp06_csv
join -a1 -t';' $csv2012 $csv2013 > $temp07_csv
join -a1 -t';' $csv2014 $csv2015 > $temp08_csv
join -a1 -t';' $temp01_csv $temp02_csv > $temp09_csv
join -a1 -t';' $temp03_csv $temp04_csv > $temp10_csv
join -a1 -t';' $temp05_csv $temp06_csv > $temp11_csv
join -a1 -t';' $temp07_csv $temp08_csv > $temp12_csv
join -a1 -t';' $temp09_csv $temp10_csv > $temp13_csv
join -a1 -t';' $temp11_csv $temp12_csv > $temp14_csv
join -a1 -t';' $temp13_csv $temp14_csv > final.csv
#done
exit 0
Any help is appreciated