Skip to content

Commit

Permalink
Moving loading and saving of sigs to Rust
Browse files Browse the repository at this point in the history
move json parsing and init to rust
working on loading sigs

55 failing. The failures now happen because the SBT index is saving all signatures
(instead of only the one that was used to build the tree).
This was actually a feature (see #198), but it broke the SBT code
(which wasn't ready for that!)
  • Loading branch information
luizirber committed Oct 2, 2018
1 parent d32e2fc commit 2f4981c
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 454 deletions.
9 changes: 9 additions & 0 deletions sourmash/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,12 @@ def implements_to_string(cls):
itervalues = lambda x: x.values()
NUL = 0
implements_to_string = lambda x: x


def to_bytes(s):
    """Return ``s`` as bytes, UTF-8 encoding it when it is a text string.

    Values that are instances of ``string_types`` are encoded with UTF-8;
    ``bytes`` values are returned unchanged.

    Raises TypeError when ``s`` is neither a ``string_types`` instance nor
    ``bytes``.
    """
    accepted_types = string_types + (bytes,)
    if not isinstance(s, accepted_types):
        raise TypeError("Requires a string-like sequence")

    return s.encode('utf-8') if isinstance(s, string_types) else s
16 changes: 5 additions & 11 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import math
import copy

from ._compat import string_types, range_type
from ._lowlevel import ffi, lib
from ._compat import to_bytes
from .utils import RustObject, rustcall

# default MurmurHash seed
Expand Down Expand Up @@ -40,15 +40,6 @@ def get_scaled_for_max_hash(max_hash):
return int(round(get_minhash_max_hash() / max_hash, 0))


def to_bytes(s):
    """Coerce a string-like value to bytes.

    Text (any ``string_types`` instance) is encoded as UTF-8; ``bytes`` is
    passed through untouched. Anything else raises TypeError.
    """
    is_text = isinstance(s, string_types)
    if not (is_text or isinstance(s, bytes)):
        raise TypeError("Requires a string-like sequence")

    if is_text:
        return s.encode('utf-8')
    return s


def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
"hash_murmur(string, [,seed])\n\n"
"Compute a hash for a string, optionally using a seed (an integer). "
Expand Down Expand Up @@ -84,7 +75,6 @@ class MinHash(RustObject):

def __init__(self, n, ksize, is_protein=False, track_abundance=False,
seed=MINHASH_DEFAULT_SEED, max_hash=0, mins=None, scaled=0):
self.track_abundance = track_abundance

if max_hash and scaled:
raise ValueError('cannot set both max_hash and scaled')
Expand Down Expand Up @@ -196,6 +186,10 @@ def subtract_mins(self, other):
b = set(other.get_mins())
return a - b

@property
def track_abundance(self):
    """Return whatever ``lib.kmerminhash_track_abundance`` reports for this object."""
    # The value is read through the wrapped native object on each access
    # rather than being stored as a Python attribute.
    result = self._methodcall(lib.kmerminhash_track_abundance)
    return result

@property
def seed(self):
    """Return the value ``lib.kmerminhash_seed`` reports for this object."""
    result = self._methodcall(lib.kmerminhash_seed)
    return result
Expand Down
162 changes: 128 additions & 34 deletions sourmash/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,56 @@
"""
from __future__ import print_function
import hashlib
import weakref

import gzip
import bz2file
import io
import sys

from . import signature_json
from .logging import error


from .logging import error
from .minhash import MinHash

from ._compat import to_bytes
from ._lowlevel import ffi, lib
from .utils import RustObject, rustcall, decode_str


SIGNATURE_VERSION=0.4


class SourmashSignature(object):
sig_refs = weakref.WeakKeyDictionary()
mhs_refs = weakref.WeakKeyDictionary()


class SourmashSignature(RustObject):
"Main class for signature information."
_name = ''
filename = ''

# Initialise a signature from `minhash`, optionally recording `name` and
# `filename` when they are non-empty.
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped. The `self.d[...]` dictionary assignments and the
# `lib.signature_*` / property assignments look like the old and new versions
# of the same statements shown back to back -- confirm against the repository
# which lines are actually present in the committed file.
def __init__(self, minhash, name='', filename=''):
self.d = {}
self.d['class'] = 'sourmash_signature'
# Allocates the native signature object through the cffi `lib` handle.
self._objptr = lib.signature_new()

if name:
self.d['name'] = name
self._name = name
if filename:
self.d['filename'] = filename
self.filename = filename

# Assigning `minhash` goes through the `minhash` property setter defined on
# this class, which pushes the MinHash pointer via `lib.signature_push_mh`.
self.minhash = minhash
self.d['license'] = 'CC0'

# presumably used by RustObject to release the native object -- TODO confirm
self.__dealloc_func__ = lib.signature_free

@property
def minhash(self):
    """Return a MinHash wrapping the pointer obtained from ``signature_first_mh``.

    The wrapper is created with ``shared=True``.
    """
    first_ptr = self._methodcall(lib.signature_first_mh)
    return MinHash._from_objptr(first_ptr, shared=True)

@minhash.setter
def minhash(self, value):
    """Push ``value``'s ``_objptr`` into this signature via ``signature_push_mh``."""
    # TODO: validate value is a MinHash
    self._methodcall(lib.signature_push_mh, value._objptr)

def __hash__(self):
    """Hash the signature by the value that ``md5sum()`` returns."""
    digest = self.md5sum()
    return hash(digest)
Expand All @@ -42,6 +66,19 @@ def __str__(self):
return "SourmashSignature({})".format(md5pref)
__repr__ = __str__

def minhashes(self):
    """Return a list of MinHash wrappers for the pointers from ``signature_get_mhs``.

    The native call writes the number of entries into a ``uintptr_t``
    out-parameter; each entry is wrapped with ``shared=True``.
    """
    count_ptr = ffi.new("uintptr_t *")
    mhs_ptr = self._methodcall(lib.signature_get_mhs, count_ptr)
    count = ffi.unpack(count_ptr, 1)[0]

    return [
        MinHash._from_objptr(mhs_ptr[idx], shared=True)
        for idx in range(count)
    ]

def md5sum(self):
"Calculate md5 hash of the bottom sketch, specifically."
m = hashlib.md5()
Expand All @@ -51,31 +88,53 @@ def md5sum(self):
return m.hexdigest()

# Equality for signatures, followed by the `_name` property (getter + setter).
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped. The key-by-key comparison of `self.d` / `other.d`
# and the later `return self.minhash == other.minhash` look like the previous
# dict-based `__eq__`, interleaved with the `lib.signature_eq` version and the
# `_name` property -- confirm against the repository before editing.
def __eq__(self, other):
allkeys = set(self.d.keys()).union(set(other.d.keys()))
for k in allkeys:
if self.d.get(k) != other.d.get(k):
return False
# Delegates the comparison to the native library, passing `other`'s pointer.
return self._methodcall(lib.signature_eq, other._objptr)

@property
def _name(self):
# Reads the name from the native object and decodes it with `decode_str`.
return decode_str(self._methodcall(lib.signature_get_name), free=True)

return self.minhash == other.minhash
@_name.setter
def _name(self, value):
# `to_bytes` converts the value to bytes before it is handed to the library.
self._methodcall(lib.signature_set_name, to_bytes(value))

# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped. The `self.d` lookups look like the previous
# implementation, immediately followed by the property-based one -- confirm
# against the repository which lines are present in the committed file.
def name(self):
"Return as nice a name as possible, defaulting to md5 prefix."
if 'name' in self.d:
return self.d.get('name')
elif 'filename' in self.d:
return self.d.get('filename')
name = self._name
filename = self.filename

# Preference order: explicit name, then filename, then the first 8
# characters of md5sum().
if name:
return name
elif filename:
return filename
else:
return self.md5sum()[:8]

@property
def filename(self):
    """Return the filename read via ``signature_get_filename``, decoded by ``decode_str``."""
    raw = self._methodcall(lib.signature_get_filename)
    return decode_str(raw, free=True)

@filename.setter
def filename(self, value):
    """Store ``value`` (converted with ``to_bytes``) via ``signature_set_filename``."""
    encoded = to_bytes(value)
    self._methodcall(lib.signature_set_filename, encoded)

@property
def license(self):
    """Return the license read via ``signature_get_license``, decoded by ``decode_str``."""
    raw = self._methodcall(lib.signature_get_license)
    return decode_str(raw, free=True)

def _display_name(self, max_length):
if 'name' in self.d:
name = self.d['name']
if len(name) > max_length:
name = name[:max_length - 3] + '...'
elif 'filename' in self.d:
name = self.d['filename']
if len(name) > max_length:
name = '...' + name[-max_length + 3:]
name = self._name
filename = self.filename

if name:
if name == filename:
name = self.d['filename']
if len(name) > max_length:
name = '...' + name[-max_length + 3:]
else:
name = self.d['name']
if len(name) > max_length:
name = name[:max_length - 3] + '...'
else:
name = self.md5sum()[:8]
assert len(name) <= max_length
Expand Down Expand Up @@ -215,22 +274,44 @@ def load_signatures(data, ksize=None, select_moltype=None,
if sys.version_info >= (3, ):
data = data.buffer

size = ffi.new("uintptr_t *")

try:
data = data.read()
except AttributeError:
pass

try:
# JSON format
for sig in signature_json.load_signatures_json(data,
ignore_md5sum=ignore_md5sum):
if not ksize or ksize == sig.minhash.ksize:
if not select_moltype or \
sig.minhash.is_molecule_type(select_moltype):
yield sig
if is_fp:
sigs_ptr = rustcall(lib.signatures_load_buffer, data, ignore_md5sum, size)
#fp_c = ffi.cast("FILE *", data)
#sigs_ptr = rustcall(lib.signatures_load_file, fp_c, ignore_md5sum, size)
else:
sigs_ptr = rustcall(lib.signatures_load_buffer, data.encode('utf-8'), ignore_md5sum, size)

size = ffi.unpack(size, 1)[0]

sigs = []
for i in range(size):
sig = SourmashSignature._from_objptr(sigs_ptr[i], shared=True)
sigs.append(sig)
sig_refs[sig] = sigs

for sig in sigs:
for minhash in sig.minhashes():
if not ksize or ksize == minhash.ksize:
if not select_moltype or \
minhash.is_molecule_type(select_moltype):
yield sig
except Exception as e:
error("Error in parsing signature; quitting.")
error("Exception: {}", str(e))
if do_raise:
raise
finally:
if is_fp:
data.close()
# finally:
# if is_fp:
# data.close()


def load_one_signature(data, ksize=None, select_moltype=None,
Expand All @@ -254,4 +335,17 @@ def load_one_signature(data, ksize=None, select_moltype=None,

# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped. The `signature_json.save_signatures_json` return
# looks like the previous implementation, immediately followed by the
# cffi-based one -- confirm against the repository which lines are present in
# the committed file.
def save_signatures(siglist, fp=None):
"Save multiple signatures into a JSON string (or into file handle 'fp')"
return signature_json.save_signatures_json(siglist, fp)
# Gather the native pointer of every signature and pack them into a C array.
collected = [obj._get_objptr() for obj in siglist]
siglist_c = ffi.new("Signature*[]", collected)

if fp is None:
buf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected))
else:
#fp_c = ffi.cast("FILE *", fp)
#buf = rustcall(lib.signatures_save_file, siglist_c, len(collected), fp_c)
# Same buffer call as the `fp is None` branch; the decoded result is written
# to `fp` and None is returned instead of the string.
buf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected))
result = decode_str(buf, free=True)
fp.write(result)
return None

# No file handle given: return the decoded buffer to the caller.
return decode_str(buf, free=True)
Loading

0 comments on commit 2f4981c

Please sign in to comment.