#! /usr/bin/env python
"""
Given a list of sourmash signatures, find the common hashes and then
output a hash co-occurrence matrix (how many samples each pair of common
hashes appears in together) that can be used for clustering.
"""
from __future__ import print_function
import argparse
import collections
import sourmash_lib.signature
import numpy


def main():
    p = argparse.ArgumentParser()
    p.add_argument('inp_signatures', nargs='+')
    p.add_argument('-k', '--ksize', type=int, default=31)
    p.add_argument('--scaled', type=int, default=100000)
    p.add_argument('-o', '--output-name', required=True)
    p.add_argument('--threshold', type=int, default=2)
    p.add_argument('--max-threshold', type=int, default=None)
    p.add_argument('--frequency', action='store_true')
    p.add_argument('--intersect', nargs='+',
                   help='only use hashes in the given files')
    args = p.parse_args()
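
    # 'counts' tracks, for each hash, how many input signatures contain it.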
    counts = collections.Counter()
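
    # If --intersect files are given, restrict the analysis to hashes that
    # appear in them: load each signature, downsample to --scaled, and pool
    # its hashes.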
    intersect_hashes = set()
    if args.intersect:
        for n, filename in enumerate(args.intersect):
            print('...loading intersect {}'.format(n + 1), end='\r')
            sig = sourmash_lib.signature.load_one_signature(filename,
                                                            ksize=args.ksize)
            mh = sig.minhash.downsample_scaled(args.scaled)
            hashes = mh.get_mins()
            intersect_hashes.update(hashes)
        print('')
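
    # Load every input signature, downsample it, optionally intersect its
    # hashes with the restriction set, and tally how many signatures each
    # hash appears in.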
    print('loading signatures from', len(args.inp_signatures), 'files')
    sig_hashes = {}
    for n, filename in enumerate(args.inp_signatures):
        print('... {}'.format(n + 1), end='\r')
        sig = sourmash_lib.signature.load_one_signature(filename,
                                                        ksize=args.ksize)
        mh = sig.minhash.downsample_scaled(args.scaled)
        hashes = mh.get_mins()

        if intersect_hashes:
            hashes = set(hashes)
            hashes.intersection_update(intersect_hashes)

        sig_hashes[filename] = hashes
        for k in hashes:
            counts[k] += 1

    print('\n...done. Now finding common hashes among >= {} samples'.format(
        args.threshold))
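
    # Walk hashes from most to least common, keeping those seen in at least
    # --threshold samples (and no more than --max-threshold, if given).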
    n = 0
    abundant_hashes = set()
    for hashval, count in counts.most_common():
        if args.max_threshold and count > args.max_threshold:
            continue
        if count < args.threshold:
            break
        n += 1
        abundant_hashes.add(hashval)

    print('found', n, 'hashes from', len(args.inp_signatures), 'signatures')
    print('min threshold: {}'.format(args.threshold))

    # go over the files again, this time building an n_hashes x n_hashes
    # matrix of hash co-occurrence.
    pa = numpy.zeros((len(abundant_hashes), len(abundant_hashes)),
                     dtype=float)

    # sort the hashes so rows/columns have a stable, reproducible order
    hashlist = list(sorted(abundant_hashes))
    hashdict = {}
    for n, k in enumerate(hashlist):
        hashdict[k] = n           # hash -> index in hashlist

    print('calculating matrix {} x {}'.format(len(abundant_hashes),
                                              len(abundant_hashes)))
    print('iterating over signatures from', len(args.inp_signatures), 'files')
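
    # For every signature, take the abundant hashes it contains and update the
    # matrix entry for each ordered pair of them.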
    for fn, (filename, hashes) in enumerate(sig_hashes.items()):
        print('... {}'.format(fn + 1), end='\r')
        x = abundant_hashes.intersection(hashes)
        for hashval in x:
            for hashval2 in x:
                idx = hashdict[hashval]
                idx2 = hashdict[hashval2]
                if args.frequency:
                    pa[idx2, idx] += 1
                else:
                    pa[idx2, idx] = 1
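
    # With --frequency, convert co-occurrence counts into the fraction of
    # samples in which each pair of hashes co-occurs.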
    if args.frequency:
        pa /= len(sig_hashes)

    print('\ndone! saving to:', args.output_name)
    with open(args.output_name, 'wb') as fp:
        numpy.save(fp, pa)
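
    # The i-th line of the labels file is the hash value for row/column i of
    # the saved matrix.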
    with open(args.output_name + '.labels.txt', 'w') as fp:
        fp.write("\n".join(map(str, hashlist)))


if __name__ == '__main__':
    main()