Skip to content

Commit

Permalink
Use a Nodegraph for searching in internal nodes
Browse files: browse the repository at this point in the history
  • Loading branch information
luizirber committed Feb 15, 2018
1 parent f469b5a commit c41fecf
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 13 deletions.
4 changes: 2 additions & 2 deletions sourmash_lib/sbt.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -678,7 +678,7 @@ def data(self):
with NamedTemporaryFile(suffix=".gz") as f:
f.write(data)
f.file.flush()
self._data = khmer.load_nodegraph(f.name)
self._data = khmer.Nodegraph.load(f.name)
return self._data

@data.setter
Expand Down Expand Up @@ -729,7 +729,7 @@ def data(self):
with NamedTemporaryFile(suffix=".gz") as f:
f.write(data)
f.file.flush()
self._data = khmer.load_nodegraph(f.name)
self._data = khmer.Nodegraph.load(f.name)
return self._data

@data.setter
Expand Down
24 changes: 13 additions & 11 deletions sourmash_lib/sbtmh.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -73,29 +73,31 @@ def data(self, new_data):
self._data = new_data


def search_minhashes(node, sig, threshold, results=None, downsample=True):
mins = sig.minhash.get_mins()
def search_minhashes(node, query, threshold, results=None, downsample=True):
score = 0

if isinstance(node, SigLeaf):
try:
score = node.data.minhash.similarity(sig.minhash)
score = node.data.minhash.similarity(query.minhash)
except Exception as e:
if 'mismatch in max_hash' in str(e) and downsample:
xx = sig.minhash.downsample_max_hash(node.data.minhash)
yy = node.data.minhash.downsample_max_hash(sig.minhash)
xx = query.minhash.downsample_max_hash(node.data.minhash)
yy = node.data.minhash.downsample_max_hash(query.minhash)

score = yy.similarity(xx)
else:
raise

else: # Node or Leaf, Nodegraph by minhash comparison
if len(mins):
matches = sum(1 for value in mins if node.data.get(value))
max_mins = node.metadata.get('max_n_below', -1)
if max_mins == -1:
raise Exception('cannot do similarity search on this SBT; need to rebuild.')
score = float(matches) / max_mins
try:
query_bf = query.bf
except AttributeError:
query_bf = node._factory()
for v in query.minhash.get_mins():
query_bf.count(v)
query.bf = query_bf

score = node.data.similarity(query_bf)

if results is not None:
results[node.name] = score
Expand Down

0 comments on commit c41fecf

Please sign in to comment.