Skip to content

Commit

Permalink
Fix use of the wrong parent taxid in the silva-download script
Browse files Browse the repository at this point in the history
  • Loading branch information
mourisl committed Jun 17, 2024
1 parent 1d1e0f2 commit 20a0913
Showing 1 changed file with 27 additions and 8 deletions.
35 changes: 27 additions & 8 deletions indices/silva-download.pl
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,12 @@ sub system_call
# download and create the taxonomy tree and name file
$prefix = "tax_slv_".lc($subunit)."_$silvaVer" ;
system_call("wget $weblink/taxonomy/${prefix}.txt.gz") ;

# Example input row (fields are tab-separated): Archaea;Aenigmarchaeota; 11084 phylum 123
open FP, "zcat ${prefix}.txt.gz |" ;
open FPnodes, ">nodes.dmp" ;
open FPnames, ">names.dmp" ;
print FPnodes "1\t|\t1\t|\tno rank\t|\n" ;
my %nameMap ;
open FPnames, ">$outputDir/names.dmp" ;
# Get the names
print FPnames "1\t|\troot\t|\tscientific name\t|\n" ;
while (<FP>)
{
Expand All @@ -75,19 +76,37 @@ sub system_call
my @nameCols = split /;/, $cols[0] ;
my $name = $nameCols[scalar(@nameCols) - 1] ;
my $tax = $cols[1] ;
my $parent = $cols[4] ;
$nameMap{$name} = $tax ;

print FPnames "$tax\t|\t$name\t|\tscientific name\t|\n" ;
}
close FPnames ;
close FP ;

# Second pass over the SILVA taxonomy table: write $outputDir/nodes.dmp with
# one "taxid | parent taxid | rank |" row per taxon, below a root node 1.
# names.dmp is written by the first pass, so only nodes.dmp is produced here.
#
# Each input row is tab-separated: column 0 is the ';'-delimited lineage path
# (e.g. "Archaea;Aenigmarchaeota;"), column 1 the taxid, column 2 the rank.
# The parent of a row is the row whose path equals this path with its last
# component removed. Parents are resolved by full path, not by bare name: a
# name that appears under more than one lineage would otherwise resolve to
# whichever taxid was read last. Rows whose parent path is not in the table
# (including top-level taxa) are attached to the root, taxid 1.
my %taxidOfPath ;
my @taxRows ;
open my $fpTax, "-|", "zcat", "${prefix}.txt.gz"
	or die "Cannot run zcat on ${prefix}.txt.gz: $!\n" ;
while (my $line = <$fpTax>)
{
	chomp $line ;
	my @cols = split /\t/, $line ;
	next if (scalar(@cols) < 3) ; # blank or malformed row: nothing to emit
	push @taxRows, [ @cols[0..2] ] ;
	$taxidOfPath{$cols[0]} = $cols[1] ;
}
# close() on a pipe reports the child's exit status, so a failed zcat is caught here
close $fpTax or die "zcat ${prefix}.txt.gz failed (status $?)\n" ;

open my $fpNodes, ">", "$outputDir/nodes.dmp"
	or die "Cannot write $outputDir/nodes.dmp: $!\n" ;
print $fpNodes "1\t|\t1\t|\tno rank\t|\n" ;
foreach my $row (@taxRows)
{
	my ($path, $tax, $rank) = @$row ;

	# Strip the last lineage component (and its trailing ';') to get the
	# parent's path: "A;B;" -> "A;", "A;" -> "".
	my $parentPath = $path ;
	$parentPath =~ s/[^;]+;?$// ;

	my $parent = 1 ;
	$parent = $taxidOfPath{$parentPath} if (defined $taxidOfPath{$parentPath}) ;
	print $fpNodes "$tax\t|\t$parent\t|\t$rank\t|\n" ;
}
# Buffered write errors only surface at close
close $fpNodes or die "Cannot finish writing $outputDir/nodes.dmp: $!\n" ;

# The taxonomy table has been converted; remove the downloaded archive.
unlink "${prefix}.txt.gz" ;

# download the seqid_to_taxid map file
# It is decompressed once, directly into $outputDir, so that no stray copy is
# left in the working directory; the downloaded archive is removed afterwards.
system_call("wget $weblink/taxonomy/${prefix}.acc_taxid.gz") ;
system_call("zcat ${prefix}.acc_taxid.gz > $outputDir/silva_seqid_to_taxid.map") ;
unlink "${prefix}.acc_taxid.gz" ;

# download the genome file
Expand All @@ -99,7 +118,7 @@ sub system_call

system_call("wget ${weblink}/${prefix}tax_silva.fasta.gz") ;
open FP, "zcat ${prefix}tax_silva.fasta.gz |" ;
open FPout, "| gzip -c > silva_seq.fa.gz" ;
open FPout, "| gzip -c > $outputDir/silva_seq.fa.gz" ;
my %seqToTax ;
while (<FP>)
{
Expand Down

0 comments on commit 20a0913

Please sign in to comment.