Revisions to How to compare two column of two file and print not matching pattern with awk

deleted 51 characters in body

Source Link

edited Feb 23, 2021 at 11:19

41.9k
17
75
118

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row starting with S or s that are not in
#               the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        if (muts[i] ~ /^[Ss]/)
            remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row starting with S or s that are not in
#               the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        # The separator is added only when something is actually appended,
        # so a skipped element can no longer produce a leading comma.
        if (muts[i] ~ /^[Ss]/)
            remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Alternative awk:

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (portable awk, no GNU extensions).
awk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row starting with S or s that are not in
#               the reference list
# Names are compared as whole list elements instead of being pasted into a
# regex, so substrings and regex metacharacters cannot match by accident and
# removing an element never glues its neighbours together.
{
    split("", listed)                   # portable way to empty an array
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++)
        listed[muts[i]] = 0             # 0 = not (yet) found in reference

    missing = ""
    m = split(mutations[$2], ref, ",")
    for (j = 1; j <= m; j++) {
        if (ref[j] in listed) {
            listed[ref[j]] = 1          # row has this reference mutation
        } else if (ref[j] != "") {
            missing = (missing == "" ? "" : missing ",") ref[j]
        }
    }

    # Walk the row by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    remaining = ""
    for (i = 1; i <= n; i++) {
        if (muts[i] == "" || listed[muts[i]] || muts[i] !~ /^[Ss]/)
            continue
        remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    print $0, missing, remaining
}' fileB fileA

added 27 characters in body

Source Link

edited Feb 23, 2021 at 11:06

αғsнιη

41.9k
17
75
118

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row starting with S or s that are not in
#               the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        # The separator is added only when something is actually appended,
        # so a skipped element can no longer produce a leading comma.
        if (muts[i] ~ /^[Ss]/)
            remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Output:

id      clade   mutation        missing_mutation        remaining_mutation
243     40A     siti,toto,mumu  xixi,saxa       siti,mumu
254
267     40B     lala,sisi,sojo  huhu    sisi

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row that are not in the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # The pattern no longer demands a trailing comma, so the last element
        # of the reference list can be matched as well.
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Output:

id      clade   mutation        missing_mutation        remaining_mutation
243     40A     siti,toto,mumu  xixi,saxa       siti,mumu
254
267     40B     lala,sisi,sojo  huhu    sisi

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row starting with S or s that are not in
#               the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        # The separator is added only when something is actually appended,
        # so a skipped element can no longer produce a leading comma.
        if (muts[i] ~ /^[Ss]/)
            remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Source Link

answered Feb 23, 2021 at 10:58

αғsнιη

41.9k
17
75
118

Alternative GNU awk for the word boundaries ("\<" and "\>"):

# Compare every row of fileA with the reference mutation list that fileB
# holds for the clade of that row (GNU awk, uses the \< \> word boundaries).
gawk 'BEGIN { FS = OFS = "\t" }

# fileB (read first): store the comma-separated mutation list of each clade.
NR == FNR { mutations[$1] = $2; next }

# fileA header: name the two added columns, then print the line.
FNR == 1 { $4 = "missing_mutation"; $5 = "remaining_mutation"; print; next }

# fileA data rows: append
#   missing   = reference mutations of the clade that the row does not list
#   remaining = mutations of the row that are not in the reference list
{
    # Work on a copy, so that several rows of the same clade are each
    # compared with the complete reference list.
    missing = mutations[$2]
    remaining = ""

    # Walk the elements by index to keep the input order
    # (the order of "for (x in array)" is unspecified).
    n = split($3, muts, ",")
    for (i = 1; i <= n; i++) {
        if (muts[i] == "")
            continue
        # The pattern no longer demands a trailing comma, so the last element
        # of the reference list can be matched as well.
        # NOTE(review): the mutation name is used as regex text; names with
        # regex metacharacters would need escaping first.
        if (gsub("\\<" muts[i] "\\>", "", missing) > 0)
            continue
        remaining = (remaining == "" ? "" : remaining ",") muts[i]
    }

    # Removing names leaves stray commas behind: collapse and trim them.
    gsub(/,+/, ",", missing)
    gsub(/^,|,$/, "", missing)

    print $0, missing, remaining
}' fileB fileA

Output:

id      clade   mutation        missing_mutation        remaining_mutation
243     40A     siti,toto,mumu  xixi,saxa       siti,mumu
254
267     40B     lala,sisi,sojo  huhu    sisi

Stack Exchange Network

Return to Answer