-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta-remove-invariant.pl
62 lines (51 loc) · 1.62 KB
/
fasta-remove-invariant.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/perl
use warnings;
use strict;
use File::Basename;
use Bio::SeqIO;
if(@ARGV < 1){
die "Usage: ".basename($0). "alnIn.fasta[.gz] >| alnOut.fasta\n";
}
my $file = shift @ARGV;
my $cmd = ($file =~ /\.gz$/) ? "zcat" : "cat";
open(my $fh, "$cmd $file |") or die "Could not open $file: $!\n";
my $SeqIO = Bio::SeqIO->new(-fh => $fh, -format => 'fasta');
my @labels; #sequence ides corresponding to rows
my @columns; #array of array ref of characters
#@{$columns[i]} = a column in the alignment
#map {$_->[j]} @columns = a row in the alignment
my @variants; #array of index keyed hash ref of uniq chars
my $length = undef;
my $N = 0;
while(my $seqObj = $SeqIO->next_seq){
push(@labels,$seqObj->id);
my @chars = split("",uc($seqObj->seq));
$N++;
if(defined $length){
unless($seqObj->length == $length){
die "All input sequences are not the same length: @{[$seqObj->id]} differs\n"
}
} else {
@columns = map {[$_]} @chars;
@variants = map {{$_ => 1}} @chars;
$length = $seqObj->length;
next;
}
for (my $i=0; $i< @columns; $i++){
push(@{$columns[$i]},$chars[$i]);
$variants[$i]->{$chars[$i]} = 1;
}
}
close($fh);
my @variantSites;
for(my $i =0; $i < @columns;$i++){
delete $variants[$i]->{"N"};
push(@variantSites,$i) if(scalar(keys %{$variants[$i]}) > 1);
}
@columns = @columns[@variantSites];
my $Output = Bio::SeqIO->newFh(-format => 'fasta');
for(my $j = 0; $j < $N;$j++){
my $seq = join("",map {$_->[$j]} @columns);
my $seqObj = Bio::Seq->new(-id => $labels[$j], seq => $seq);
print $Output $seqObj;
}