AB006589__ESR2,BC024181__ESR2,0.47796
AB006589__ESR2,X55739__CSN2,0.47232
AB006589__ESR2,NM_004991__MDS1,0.46704
AB006589__ESR2,NM_003476__CSRP3,0.45767
AB006589__ESR2,NM_012101__TRIM29,0.45094
AB006589__ESR2,NM_006897__HOXC9,0.41748
AB006589__ESR2,NM_000278__PAX2,0.4161
NM_003476__CSRP3,AB006589__ESR2,0.45767
NM_012101__TRIM29,AB006589__ESR2,0.45094
NM_006897__HOXC9,AB006589__ESR2,0.41748
NM_000278__PAX2,AB006589__ESR2,0.4161
Now, the problem is that line 4
AB006589__ESR2,NM_003476__CSRP3,0.45767
is a duplicate of line 8
NM_003476__CSRP3,AB006589__ESR2,0.45767
There are many cases like this in my large CSV file.
So my question is: how can I identify all such symmetric duplicate pairs and keep only one row from each pair?
use strict;
use warnings;

# Remove symmetric duplicates from a list of CSV lines of the form
# "A,B,score". Two lines are considered duplicates when their first two
# fields are the same pair in either order ("A,B,..." vs "B,A,...").
# The first occurrence of each pair is kept, in input order.
#
# Arguments: list of CSV lines (already chomped).
# Returns:   list of kept lines.
sub dedupe_pairs {
    my %seen;
    my @kept;
    for my $line (@_) {
        my ($left, $right) = split /,/, $line;
        # Canonical, order-independent key: sort the two IDs so that
        # "A,B" and "B,A" map to the same key — one lookup instead of two.
        my $key = join "\t", sort $left, $right;
        # Postfix ++ returns the pre-increment value, so this is true
        # only the first time the key is seen.
        push @kept, $line unless $seen{$key}++;
    }
    return @kept;
}

# Script entry point: process the CSV file when run directly
# (skipped when the file is loaded as a library, e.g. by tests).
if ( !caller ) {
    # Three-arg open with a lexical filehandle; die with the OS error
    # instead of silently reading nothing if the file is missing.
    open my $fh, '<', 'tf_tf_mic.csv'
        or die "Cannot open tf_tf_mic.csv: $!";
    chomp( my @lines = <$fh> );
    close $fh;

    # Printing in input order makes the output deterministic, unlike
    # iterating over hash keys (which is randomized per run in Perl).
    print "$_\n" for dedupe_pairs(@lines);
}
This actually worked in 10 seconds for a 4 million line file.
perl?