blanket implementation for counter_gather
luizirber committed Feb 7, 2021
1 parent 059b90c commit 341081e
Showing 2 changed files with 65 additions and 2 deletions.
src/sourmash/index/__init__.py (64 additions, 1 deletion)
@@ -1,7 +1,7 @@
"An Abstract Base Class for collections of signatures."

from abc import abstractmethod, ABC
-from collections import namedtuple
+from collections import namedtuple, Counter


class Index(ABC):
@@ -117,6 +117,69 @@ def gather(self, query, *args, **kwargs):

        return results

    def counter_gather(self, query, *args, **kwargs):
        "Perform compositional analysis of the query using the gather algorithm."
        if not query.minhash:        # empty query? quit.
            return []

        scaled = query.minhash.scaled
        if not scaled:
            raise ValueError('gather requires scaled signatures')

        threshold_bp = kwargs.get('threshold_bp', 0.0)
        threshold = 0.0
        n_threshold_hashes = 0

        # are we setting a threshold?
        if threshold_bp:
            # if we have a threshold_bp of N, then that amounts to N/scaled
            # hashes:
            n_threshold_hashes = float(threshold_bp) / scaled

            # that then requires the following containment:
            threshold = n_threshold_hashes / len(query.minhash)

            # is it too high to ever match? if so, exit.
            if threshold > 1.0:
                return []
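        # (illustration, not part of the commit: with scaled=1000 and a
        # threshold_bp of 50000, n_threshold_hashes is 50; a query with 500
        # hashes then needs containment >= 50/500 = 0.1 to keep a match.)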

        # pre-load signatures so we can index datasets by position
        signatures = list(self.signatures())

        # process all datasets, building a Counter that maps each dataset
        # to the number of hashes it has in common with the query
        counter = Counter()
        for (i, ss) in enumerate(signatures):
            # the second argument downsamples to a common scaled, if needed
            counter[i] = query.minhash.count_common(ss.minhash, True)
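        # (illustration, not part of the commit: if dataset 0 shares 60
        # hashes with the query and dataset 1 shares 45, the counter now
        # holds Counter({0: 60, 1: 45}), and most_common() surfaces 0 first.)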

        # decompose the query into matching signatures, using a greedy
        # approach (gather)
        results = []
        match_size = n_threshold_hashes
        while counter and match_size >= n_threshold_hashes:
            most_common = counter.most_common()
            dataset_id, size = most_common[0]
            if size >= n_threshold_hashes:
                match_size = size
            else:
                break

            match = signatures[dataset_id]
            del counter[dataset_id]
            cont = query.minhash.contained_by(match.minhash, True)
            if cont and cont >= threshold:
                results.append((cont, match, getattr(self, "filename", None)))

            # prepare the counter for finding the next match: decrement
            # the hashes found in the current match from all other datasets
            for (dataset_id, _) in most_common:
                counter[dataset_id] -= signatures[dataset_id].minhash.count_common(match.minhash, True)
                if counter[dataset_id] <= 0:    # drop exhausted datasets
                    del counter[dataset_id]

        # sort by containment, breaking ties by md5 for a consistent order
        results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))

        return results

    @abstractmethod
    def select(self, ksize=None, moltype=None):
        ""
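Because counter_gather is implemented on the Index base class, every concrete index picks it up for free. A minimal usage sketch, assuming sourmash's public MinHash, SourmashSignature, and LinearIndex APIs (the hash values, names, and threshold below are made up for illustration):

    from sourmash import MinHash, SourmashSignature
    from sourmash.index import LinearIndex

    def make_sig(hashes, name):
        # scaled=1 keeps every hash, so overlaps are exact in this toy example
        mh = MinHash(n=0, ksize=31, scaled=1)
        for h in hashes:
            mh.add_hash(h)
        return SourmashSignature(mh, name=name)

    query = make_sig(range(1, 101), 'query')      # 100 hashes

    lidx = LinearIndex()
    lidx.insert(make_sig(range(1, 61), 'a'))      # 60 hashes in common with query
    lidx.insert(make_sig(range(41, 101), 'b'))    # 60 in common, 40 not covered by 'a'

    # threshold_bp=10 with scaled=1 means a match needs >= 10 hashes in common
    for containment, match, filename in lidx.counter_gather(query, threshold_bp=10):
        print(match.name, round(containment, 2))

Here 'a' is chosen first (60 hashes in common); the decrement step then leaves 'b' with 40 uncovered hashes, still above the 10-hash threshold, so both matches are reported with their containment of the original query.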
src/sourmash/search.py (1 addition, 1 deletion)
@@ -85,7 +85,7 @@ def _find_best(dblist, query, threshold_bp):

    # search across all databases
    for (obj, filename, filetype) in dblist:
-        for cont, match, fname in obj.gather(query, threshold_bp=threshold_bp):
+        for cont, match, fname in obj.counter_gather(query, threshold_bp=threshold_bp):
            assert cont   # all matches should be nonzero.

            # note, break ties based on name, to ensure consistent order.
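For context, a simplified sketch of how _find_best consumes these (containment, match, filename) tuples; only the loop shown above is taken from the diff, and the rest is assumed for illustration:

    def _find_best(dblist, query, threshold_bp):
        # keep the single best match across all databases
        best_cont, best_match, best_filename = 0.0, None, None
        for (obj, filename, filetype) in dblist:
            for cont, match, fname in obj.counter_gather(query, threshold_bp=threshold_bp):
                assert cont   # all matches should be nonzero.
                if cont > best_cont:
                    best_cont, best_match, best_filename = cont, match, fname
        return best_cont, best_match, best_filename

(Per the comment in the diff, the real function also breaks ties on the match name to keep results deterministic.)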
