I am trying to parse a huge file ( arround 13 GB ) and transpose it in a csv ( can also transpose it in two or three ). The file has the records on one line that is why it has arround 500.000.000 rows.Also, the attributes may vary from one record to another- some columns may appear and some may. I came up with a shell script for transposing it but it takes 12 minutes to process 1.000.000 rows, so it would take 100 hours to parse the hole file.
The shell script is the following:
#############################################
# Define the usage
#############################################
gUsage="
usage: %> `basename $0` <Run Date> <Input path> <Preprocessing path> <filename>
where
Input path: Generic folder where the input file is for transposing
Preprocessing path: Generic folder where the processed file will be moved
filename: Template for filename
"
ls_current_date=`date +'%Y-%m-%d'`
ls_current_time=`date +'%H%M%S'`
ls_run_name="${ls_current_date}"_"${ls_current_time}"
i=-1
j=0
d=-1
# Check number of parameters
if [ $# -ne 4 ]; then
echo ""
echo "ERROR: Expecting 4 parameters"
echo "$gUsage"
exit
fi
ls_current_date=`date +'%Y-%m-%d'`
ls_current_time=`date +'%H%M%S'`
ls_run_name="${ls_current_date}"_"${ls_current_time}"
#############################################
# VN Declare & Check User Parameters + input files existence
#############################################
p_InputPath=$2
p_PreprocessingPath=$3
p_filename=$4
echo "Start time : $ls_run_name " > "${p_PreprocessingPath}/log.txt"
echo " Starting the transposing process..." >> "${p_PreprocessingPath}/log.txt"
echo " " >> "${p_PreprocessingPath}/log.txt"
echo " " >> "${p_PreprocessingPath}/log.txt"
### Parameter 1 is the Run Date will test for TODAY (today's date in the format YYYY-MM-DD)
if [ "$1" -eq "TODAY" ]; then
p_Rundate=`date +'%Y-%m-%d'`
else
p_Rundate=$1
fi
echo "*************************************************************"
echo "Checking File Existence"
echo "*************************************************************"
ODSM_FILE="$p_InputPath/$p_filename"
if [ -f $ODSM_FILE ];
then
echo "Source file ODSM found: $ODSM_FILE !"
else
echo "ERROR: source file ODSM_FILE does not exist or does not match the pattern $ODSM_FILE."
exit
fi
#Define the header of the file
header="entry-id;kmMsisdn;serialNumber;kmSubscriptionType;kmSubscriptionType2;kmVoiceTan;kmDataTan;kmPaymentMethod;kmMccsDate;kmCustomerBlocked;kmNetworkOperatorBlocked;kmBlockedNetwork;kmMmpNoStatus;kmMmpM3cCreditLimit;kmMmpM3cStatus;kmMmpM3cStatusDate;kmMmpM3cRegistrationDate;creatorsName;createTimestamp;modifiersName;modifyTimestamp;kmBrandName;objectClass;cn;kmBlockedServices;kmServiceProvider"
delimiter=";"
number_col=$(grep -o "$delimiter" <<< "$header" | wc -l)
number_col2=`expr "$number_col + 1" | bc`
#Create the new file
v=$(basename $p_filename)
name=${v%.*}
extension=${v#*.}
p_shortFileName=$name
#Insert Header in file
p_newFileName="${p_PreprocessingPath}/${p_shortFileName}_Transposed.csv"
echo $header > $p_newFileName
#Create the matrix with the columns and their values
declare -A a
#Parse line by line the file
while read -r line;
do
var=$line
#echo $line
Column_Name=${var%:*}
Column_Value=${var#*:}
var="# entry-id"
if [[ "$Column_Name" == "$var" && $Column_Value -ne 1 ]];
then
((i++))
if [ $i -gt 0 ];
then
z=$(($i-1))
#Write the previous loaded record
echo ${a[$z,0]} ${a[$z,1]} ${a[$z,2]} ${a[$z,3]} ${a[$z,4]} ${a[$z,5]} ${a[$z,6]} ${a[$z,7]} ${a[$z,8]} ${a[$z,9]} ${a[$z,10]} ${a[$z,11]} ${a[$z,12]} ${a[$z,13]} ${a[$z,14]} ${a[$z,15]} ${a[$z,16]} ${a[$z,17]} ${a[$z,18]} ${a[$z,19]} ${a[$z,20]} ${a[$z,21]} ${a[$z,22]} ${a[$z,23]} ${a[$z,24]} ${a[$z,25]} >> $p_newFileName
fi
c=0
a[$i,0]=";"
a[$i,1]=";"
a[$i,2]=";"
a[$i,3]=";"
a[$i,4]=";"
a[$i,5]=";"
a[$i,6]=";"
a[$i,7]=";"
a[$i,8]=";"
a[$i,9]=";"
a[$i,10]=";"
a[$i,11]=";"
a[$i,12]=";"
a[$i,13]=";"
a[$i,14]=";"
a[$i,15]=";"
a[$i,16]=";"
a[$i,17]=";"
a[$i,18]=";"
a[$i,19]=";"
a[$i,20]=";"
a[$i,21]=";"
a[$i,22]=";"
a[$i,23]=";"
a[$i,24]=";"
a[$i,25]=";"
a[$i,26]=" "
a[$i,0]="$Column_Value ;"
#v[$i]=$i
elif [[ $Column_Name == "kmMsisdn" && $i -gt -1 ]];
then
a[$i,1]="$Column_Value ;"
elif [[ $Column_Name == "serialNumber" && $i -gt -1 ]];
then
a[$i,2]="$Column_Value ;"
elif [[ $Column_Name == "kmSubscriptionType" && $i -gt -1 ]];
then
a[$i,3]="$Column_Value ;"
elif [[ $Column_Name == "kmSubscriptionType2" && $i -gt -1 ]];
then
a[$i,4]="$Column_Value ;"
elif [[ $Column_Name == "kmVoiceTan" && $i -gt -1 ]];
then
a[$i,5]="$Column_Value ;"
elif [[ $Column_Name == "kmDataTan" && $i -gt -1 ]];
then
a[$i,6]="$Column_Value ;"
elif [[ $Column_Name == "kmPaymentMethod" && $i -gt -1 ]];
then
a[$i,7]="$Column_Value ;"
elif [[ $Column_Name == "kmMccsDate" && $i -gt -1 ]];
then
a[$i,8]="$Column_Value ;"
elif [[ $Column_Name == "kmCustomerBlocked" && $i -gt -1 ]];
then
a[$i,9]="$Column_Value ;"
elif [[ $Column_Name == "kmNetworkOperatorBlocked" && $i -gt -1 ]];
then
a[$i,10]="$Column_Value ;"
elif [[ $Column_Name == "kmBlockedNetwork" && $i -gt -1 ]];
then
a[$i,11]="$Column_Value ;"
elif [[ $Column_Name == "kmMmpNoStatus" && $i -gt -1 ]];
then
a[$i,12]="$Column_Value ;"
elif [[ $Column_Name == "kmMmpM3cCreditLimit" && $i -gt -1 ]];
then
a[$i,13]="$Column_Value ;"
elif [[ $Column_Name == "kmMmpM3cStatus" && $i -gt -1 ]];
then
a[$i,14]="$Column_Value ;"
elif [[ $Column_Name == "kmMmpM3cStatusDate" && $i -gt -1 ]];
then
a[$i,15]="$Column_Value ;"
elif [[ $Column_Name == "kmMmpM3cRegistrationDate" && $i -gt -1 ]];
then
a[$i,16]="$Column_Value ;"
elif [[ $Column_Name == "creatorsName" && $i -gt -1 ]];
then
a[$i,17]="$Column_Value ;"
elif [[ $Column_Name == "createTimestamp" && $i -gt -1 ]];
then
a[$i,18]="$Column_Value ;"
elif [[ $Column_Name == "modifiersName" && $i -gt -1 ]];
then
a[$i,19]="$Column_Value ;"
elif [[ $Column_Name == "modifyTimestamp" && $i -gt -1 ]];
then
a[$i,20]="$Column_Value ;"
elif [[ $Column_Name == "kmBrandName" && $i -gt -1 ]];
then
a[$i,21]="$Column_Value ;"
elif [[ $Column_Name == "objectClass" && $i -gt -1 ]];
then
if [ $c -eq 0 ];
then
a[$i,22]="$Column_Value ;"
((c++))
else
a[$i,22]="$Column_Value+${a[$i,22]}"
((c++))
fi
elif [[ $Column_Name == "cn" && $i -gt -1 ]];
then
a[$i,23]="$Column_Value ;"
elif [[ $Column_Name == "kmBlockedServices" && $i -gt -1 ]];
then
a[$i,24]="$Column_Value ;"
elif [[ $Column_Name == "kmServiceProvider" && $i -gt -1 ]];
then
a[$i,25]="$Column_Value "
fi
done < $ODSM_FILE
#Write the last line of the matrix
echo ${a[$i,0]} ${a[$i,1]} ${a[$i,2]} ${a[$i,3]} ${a[$i,4]} ${a[$i,5]} ${a[$i,6]} ${a[$i,7]} ${a[$i,8]} ${a[$i,9]} ${a[$i,10]} ${a[$i,11]} ${a[$i,12]} ${a[$i,13]} ${a[$i,14]} ${a[$i,15]} ${a[$i,16]} ${a[$i,17]} ${a[$i,18]} ${a[$i,19]} ${a[$i,20]} ${a[$i,21]} ${a[$i,22]} ${a[$i,23]} ${a[$i,24]} ${a[$i,25]} >> $p_newFileName
echo "Created transposed file: $p_newFileName ."
ls_current_date2=`date +'%Y-%m-%d'`
ls_current_time2=`date +'%H%M%S'`
ls_run_name2="${ls_current_date2}"_"${ls_current_time2}"
echo "Completed "
echo "End time : $ls_run_name2 " >> "${p_PreprocessingPath}/log.txt"
`
Below you can find a sample of the file ( entry 1 is the header of the file and I do not need it at all ) .
version: 1
# entry-id: 1
dn: ou=CONNECTIONS,c=NL,o=Mobile
modifyTimestamp: 20130223124344Z
modifiersName: cn=directory manager
aci: (targetattr = "*")
# entry-id: 3
dn: kmmsisdn=31653440000,ou=CONNECTIONS,c=NL,o=Mobile
modifyTimestamp: 20331210121726Z
modifiersName: cn=directory manager
cn: MCCS
kmBrandName: VOID
kmBlockedNetwork: N
kmNetworkOperatorBlocked: N
kmCustomerBlocked: N
kmMsisdn: 31653440000
objectClass: top
objectClass: device
objectClass: kmConnection
serialNumber: 204084400000000
kmServiceProvider: 1
kmVoiceTan: 25
kmSubscriptionType: FLEXI
kmPaymentMethod: ABO
kmMccsDate: 22/03/2004
nsUniqueId: 2b72cfe9-f8b221d9-80088800-00000000
# entry-id: 4
dn: kmmsisdn=31153128215,ou=CONNECTIONS,c=NL,o=Mobile
modifyTimestamp: 22231210103328Z
modifiersName: cn=directory manager
cn: MCCS
kmMmpM3cStatusDate: 12/01/2012
kmMmpM3cStatus: Potential
kmBrandName: VOID
kmBlockedNetwork: N
kmNetworkOperatorBlocked: N
kmCustomerBlocked: N
kmMsisdn: 31153128215
objectClass: top
objectClass: device
objectClass: kmConnection
objectClass: kmMultiMediaPortalService
serialNumber: 214283011000000
kmServiceProvider: 1
kmVoiceTan: 25
kmSubscriptionType: FLEXI
kmPaymentMethod: ABO
kmMccsDate: 22/03/2004
nsUniqueId: 92723fea-f8e211d9-8011000-01110000
If this is not achievable with shell scripting. Can you please suggest something that would do it faster ( perl, python ). I don't know any other scripting language but I can learn :) .
case "${Column_Name}" in ..)? Can you also remove the test on $i by parsing the lines before the first# entry-idbefore entering the loop? Do you also have a lot unused attibutes (preprocess using grep or use continue in the loop when 25 columns are filled) ?var="# entry-id"can be moved above the loop when you use another varname for it. And how aboutwhile IFS=: read -r Column_Name Column_Value