blanket implementation for counter_gather
luizirber committed Feb 7, 2021
1 parent 059b90c commit 341081e
Showing 2 changed files with 65 additions and 2 deletions.
src/sourmash/index/__init__.py (64 additions, 1 deletion)
@@ -1,7 +1,7 @@
"An Abstract Base Class for collections of signatures."

from abc import abstractmethod, ABC
-from collections import namedtuple
+from collections import namedtuple, Counter


class Index(ABC):
@@ -117,6 +117,69 @@ def gather(self, query, *args, **kwargs):

        return results

    def counter_gather(self, query, *args, **kwargs):
        "Perform compositional analysis of the query using the gather algorithm."
        if not query.minhash:        # empty query? quit.
            return []

        scaled = query.minhash.scaled
        if not scaled:
            raise ValueError('gather requires scaled signatures')

        threshold_bp = kwargs.get('threshold_bp', 0.0)
        threshold = 0.0
        n_threshold_hashes = 0

        # are we setting a threshold?
        if threshold_bp:
            # if we have a threshold_bp of N, then that amounts to N/scaled
            # hashes:
            n_threshold_hashes = float(threshold_bp) / scaled

            # that then requires the following containment:
            threshold = n_threshold_hashes / len(query.minhash)

            # is it too high to ever match? if so, exit.
            if threshold > 1.0:
                return []
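        # (illustration, not part of the commit: with scaled=1000 and a
        # threshold_bp of 50000, n_threshold_hashes is 50; a query with 500
        # hashes then needs containment >= 50/500 = 0.1 to keep a match.)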

        # pre-load signatures so we can index datasets by position
        signatures = list(self.signatures())

        # process all datasets, building a Counter that maps each dataset
        # to the number of hashes it has in common with the query
        counter = Counter()
        for (i, ss) in enumerate(signatures):
            # the second argument downsamples to a common scaled, if needed
            counter[i] = query.minhash.count_common(ss.minhash, True)
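        # (illustration, not part of the commit: if dataset 0 shares 60
        # hashes with the query and dataset 1 shares 45, the counter now
        # holds Counter({0: 60, 1: 45}), and most_common() surfaces 0 first.)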

        # decompose the query into matching signatures, using a greedy
        # approach (gather)
        results = []
        match_size = n_threshold_hashes
        while counter and match_size >= n_threshold_hashes:
            most_common = counter.most_common()
            dataset_id, size = most_common[0]
            if size >= n_threshold_hashes:
                match_size = size
            else:
                break

            match = signatures[dataset_id]
            del counter[dataset_id]
            cont = query.minhash.contained_by(match.minhash, True)
            if cont and cont >= threshold:
                results.append((cont, match, getattr(self, "filename", None)))

            # prepare the counter for finding the next match: decrement
            # the hashes found in the current match from all other datasets
            for (dataset_id, _) in most_common:
                counter[dataset_id] -= signatures[dataset_id].minhash.count_common(match.minhash, True)
                if counter[dataset_id] <= 0:    # drop exhausted datasets
                    del counter[dataset_id]

        # sort by containment, breaking ties by md5 for a consistent order
        results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))

        return results

    @abstractmethod
    def select(self, ksize=None, moltype=None):
        ""
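Because counter_gather is implemented on the Index base class, every concrete index picks it up for free. A minimal usage sketch, assuming sourmash's public MinHash, SourmashSignature, and LinearIndex APIs (the hash values, names, and threshold below are made up for illustration):

    from sourmash import MinHash, SourmashSignature
    from sourmash.index import LinearIndex

    def make_sig(hashes, name):
        # scaled=1 keeps every hash, so overlaps are exact in this toy example
        mh = MinHash(n=0, ksize=31, scaled=1)
        for h in hashes:
            mh.add_hash(h)
        return SourmashSignature(mh, name=name)

    query = make_sig(range(1, 101), 'query')      # 100 hashes

    lidx = LinearIndex()
    lidx.insert(make_sig(range(1, 61), 'a'))      # 60 hashes in common with query
    lidx.insert(make_sig(range(41, 101), 'b'))    # 60 in common, 40 not covered by 'a'

    # threshold_bp=10 with scaled=1 means a match needs >= 10 hashes in common
    for containment, match, filename in lidx.counter_gather(query, threshold_bp=10):
        print(match.name, round(containment, 2))

Here 'a' is chosen first (60 hashes in common); the decrement step then leaves 'b' with 40 uncovered hashes, still above the 10-hash threshold, so both matches are reported with their containment of the original query.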
src/sourmash/search.py (1 addition, 1 deletion)
@@ -85,7 +85,7 @@ def _find_best(dblist, query, threshold_bp):

    # search across all databases
    for (obj, filename, filetype) in dblist:
-        for cont, match, fname in obj.gather(query, threshold_bp=threshold_bp):
+        for cont, match, fname in obj.counter_gather(query, threshold_bp=threshold_bp):
            assert cont   # all matches should be nonzero.

            # note, break ties based on name, to ensure consistent order.
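For context, a simplified sketch of how _find_best consumes these (containment, match, filename) tuples; only the loop shown above is taken from the diff, and the rest is assumed for illustration:

    def _find_best(dblist, query, threshold_bp):
        # keep the single best match across all databases
        best_cont, best_match, best_filename = 0.0, None, None
        for (obj, filename, filetype) in dblist:
            for cont, match, fname in obj.counter_gather(query, threshold_bp=threshold_bp):
                assert cont   # all matches should be nonzero.
                if cont > best_cont:
                    best_cont, best_match, best_filename = cont, match, fname
        return best_cont, best_match, best_filename

(Per the comment in the diff, the real function also breaks ties on the match name to keep results deterministic.)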
