-
Notifications
You must be signed in to change notification settings - Fork 2
/
SingleLinkageClusterer.pm
executable file
·111 lines (77 loc) · 2.44 KB
/
SingleLinkageClusterer.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package main;
our $CLUSTERPATH;
package SingleLinkageClusterer;
## package not to be instantiated. Just provides a namespace.
## Input: Array containing array-refs of pairs:
## @_ = ( [1,2], [2,3], [6,7], [7,8], ...)
## Output: Array of all clusters as array-refs.
## return ([1,2,3] , [6,7,8], ...)
use strict;
our $JACCARD_COEFF;
sub build_clusters {
my @pairs = @_;
my $uniq_stamp = "$$." . time() . "." . rand();
my $pairfile = "/tmp/$uniq_stamp.pairs";
#must do mapping because cluster program doesn't like word chars, just ints.
my %map_id_to_feat;
my %map_feat_to_id;
my $id = 1;
open (PAIRLIST, ">$pairfile") or die "Can't write $pairfile to /tmp";
foreach my $pair (@pairs) {
my ($a, $b) = @$pair;
unless ($map_feat_to_id{$a}) {
$map_feat_to_id{$a} = $id;
$map_id_to_feat{$id} = $a;
$id++;
}
unless ($map_feat_to_id{$b}) {
$map_feat_to_id{$b} = $id;
$map_id_to_feat{$id} = $b;
$id++;
}
print PAIRLIST "$map_feat_to_id{$a} $map_feat_to_id{$b}\n";
}
close PAIRLIST;
my $clusterfile = "/tmp/$uniq_stamp.clusters";
my $cluster_prog = "slclust";
if ($CLUSTERPATH) {
$cluster_prog = $CLUSTERPATH;
}
if ($JACCARD_COEFF) {
$cluster_prog .= " -j $JACCARD_COEFF";
}
system "touch $clusterfile";
unless (-w $clusterfile) { die "Can't write $clusterfile";}
my $cmd = "ulimit -s unlimited && $cluster_prog < $pairfile > $clusterfile";
my $ret = system ($cmd);
if ($ret) {
die "ERROR: Couldn't run cluster properly via path: $cluster_prog.\ncmd: $cmd";
}
my @clusters;
open (CLUSTERS, $clusterfile);
while (my $line = <CLUSTERS>) {
my @elements;
while ($line =~ /(\d+)\s?/g) {
push (@elements, $map_id_to_feat{$1});
}
if (@elements) {
push (@clusters, [@elements]);
}
}
close CLUSTERS;
## clean up
unlink ($pairfile, $clusterfile);
return (@clusters);
}
############
## Testing
###########
sub __run_test {
my @pairs = ( [1,2], [2,3], [4,5] );
my @clusters = &SingleLinkageClusterer::build_clusters(@pairs);
use Data::Dumper;
print "Input: " . Dumper(\@pairs);
print "Output: " . Dumper(\@clusters);
exit(0);
}
1;