From 6adfda9f7ca6d7d526c400ce32d8f1713fcbfaac Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 13 Mar 2018 16:35:53 -0700 Subject: [PATCH] Moving loading and save sigs to rust move json parsing and init to rust working on loading sigs enable pickling for signatures sort mins and abunds remove dead code remove refs --- include/sourmash.h | 2 + setup.py | 3 +- sourmash/_minhash.py | 3 +- sourmash/commands.py | 3 +- sourmash/sig/__main__.py | 6 +- sourmash/signature.py | 365 +++++++++++++++++++++-------------- sourmash/signature_json.py | 310 ----------------------------- sourmash/sourmash_args.py | 2 +- src/ffi/minhash.rs | 13 ++ src/sketch/minhash.rs | 50 +++-- tests/test__minhash.py | 1 - tests/test_signature.py | 3 +- tests/test_signature_json.py | 163 ---------------- tests/test_sourmash.py | 2 +- 14 files changed, 281 insertions(+), 645 deletions(-) delete mode 100644 sourmash/signature_json.py delete mode 100644 tests/test_signature_json.py diff --git a/include/sourmash.h b/include/sourmash.h index 27575812bc..5f26b1fbac 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -109,6 +109,8 @@ uint32_t kmerminhash_ksize(KmerMinHash *ptr); uint64_t kmerminhash_max_hash(KmerMinHash *ptr); +SourmashStr kmerminhash_md5sum(KmerMinHash *ptr); + void kmerminhash_merge(KmerMinHash *ptr, const KmerMinHash *other); bool kmerminhash_is_compatible(const KmerMinHash *ptr, const KmerMinHash *other); diff --git a/setup.py b/setup.py index d6ce0e119b..29c6d23443 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,7 @@ def build_native(spec): 'sourmash = sourmash.__main__:main' ] }, - "install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy', - "cffi", + "install_requires": ["screed>=0.9", "khmer>=2.1", "cffi", 'numpy', 'matplotlib', 'scipy', "deprecation>=2.0.6"], "setup_requires": [ "setuptools>=38.6.0", diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py index 43d8be606f..e936112867 100644 --- a/sourmash/_minhash.py +++ b/sourmash/_minhash.py @@ -68,6 +68,8 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): class MinHash(RustObject): + __dealloc_func__ = lib.kmerminhash_free + def __init__( self, n, @@ -98,7 +100,6 @@ def __init__( self._objptr = lib.kmerminhash_new( n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance ) - self.__dealloc_func__ = lib.kmerminhash_free if mins: if track_abundance: diff --git a/sourmash/commands.py b/sourmash/commands.py index 2b198a66d5..1dd0238129 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -1036,7 +1036,6 @@ def watch(args): ksize = tree_mh.ksize E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp) - streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) notify('Computing signature for k={}, {} from stdin', ksize, moltype) @@ -1044,6 +1043,7 @@ def do_search(): search_fn = SearchMinHashesFindBest().search results = [] + streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) for leaf in tree.find(search_fn, streamsig, args.threshold): results.append((streamsig.similarity(leaf.data), leaf.data)) @@ -1081,6 +1081,7 @@ def do_search(): if args.output: notify('saving signature to {}', args.output.name) + streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) sig.save_signatures([streamsig], args.output) diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 10c9765107..c78738b768 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -121,8 +121,8 @@ def describe(args): with_abundance = 1 md5 = sig.md5sum() name = sig.name() - filename = sig.d.get('filename', '') - license = sig.d['license'] + filename = sig.filename + license = sig.license if w: w.writerow(locals()) @@ -454,7 +454,7 @@ def rename(args): select_moltype=moltype) for sigobj in siglist: - sigobj.d['name'] = args.name + sigobj._name = args.name outlist.append(sigobj) if args.output: diff --git a/sourmash/signature.py b/sourmash/signature.py index fafcfd2ece..71c4fa625c 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -3,33 +3,47 @@ Save and load MinHash sketches in a JSON format, along with some metadata. """ from __future__ import print_function -import hashlib -import gzip -import bz2file -import io import sys +import os +import weakref -from . import signature_json from .logging import error +from . import MinHash +from ._minhash import to_bytes +from ._lowlevel import ffi, lib +from .utils import RustObject, rustcall, decode_str -SIGNATURE_VERSION=0.4 +SIGNATURE_VERSION = 0.4 -class SourmashSignature(object): +class SourmashSignature(RustObject): "Main class for signature information." - def __init__(self, minhash, name='', filename=''): - self.d = {} - self.d['class'] = 'sourmash_signature' + __dealloc_func__ = lib.signature_free + + def __init__(self, minhash, name="", filename=""): + self._objptr = lib.signature_new() + if name: - self.d['name'] = name + self._name = name if filename: - self.d['filename'] = filename + self.filename = filename self.minhash = minhash - self.d['license'] = 'CC0' + + + @property + def minhash(self): + return MinHash._from_objptr( + self._methodcall(lib.signature_first_mh) + ) + + @minhash.setter + def minhash(self, value): + # TODO: validate value is a MinHash + self._methodcall(lib.signature_set_mh, value._objptr) def __hash__(self): return hash(self.md5sum()) @@ -40,87 +54,85 @@ def __str__(self): if name != md5pref: return "SourmashSignature('{}', {})".format(name, md5pref) return "SourmashSignature({})".format(md5pref) + __repr__ = __str__ + #def minhashes(self): + # size = ffi.new("uintptr_t *") + # mhs_ptr = self._methodcall(lib.signature_get_mhs, size) + # size = ffi.unpack(size, 1)[0] + # + # mhs = [] + # for i in range(size): + # mh = MinHash._from_objptr(mhs_ptr[i]) + # mhs.append(mh) + # + # return mhs + def md5sum(self): "Calculate md5 hash of the bottom sketch, specifically." - m = hashlib.md5() - m.update(str(self.minhash.ksize).encode('ascii')) - for k in self.minhash.get_mins(): - m.update(str(k).encode('utf-8')) - return m.hexdigest() + return decode_str(self.minhash._methodcall(lib.kmerminhash_md5sum), free=True) def __eq__(self, other): - allkeys = set(self.d.keys()).union(set(other.d.keys())) - for k in allkeys: - if self.d.get(k) != other.d.get(k): - return False + return self._methodcall(lib.signature_eq, other._objptr) + + @property + def _name(self): + return decode_str(self._methodcall(lib.signature_get_name), free=True) - return self.minhash == other.minhash + @_name.setter + def _name(self, value): + self._methodcall(lib.signature_set_name, to_bytes(value)) def __ne__(self, other): return not self == other def name(self): "Return as nice a name as possible, defaulting to md5 prefix." - if 'name' in self.d: - return self.d.get('name') - elif 'filename' in self.d: - return self.d.get('filename') + name = self._name + filename = self.filename + + if name: + return name + elif filename: + return filename else: return self.md5sum()[:8] + @property + def filename(self): + return decode_str(self._methodcall(lib.signature_get_filename), free=True) + + @filename.setter + def filename(self, value): + self._methodcall(lib.signature_set_filename, to_bytes(value)) + + @property + def license(self): + return decode_str(self._methodcall(lib.signature_get_license), free=True) + def _display_name(self, max_length): - if 'name' in self.d: - name = self.d['name'] + name = self._name + filename = self.filename + + if name: if len(name) > max_length: - name = name[:max_length - 3] + '...' - elif 'filename' in self.d: - name = self.d['filename'] + name = name[: max_length - 3] + "..." + elif filename: + name = filename if len(name) > max_length: - name = '...' + name[-max_length + 3:] + name = "..." + name[-max_length + 3 :] else: name = self.md5sum()[:8] assert len(name) <= max_length return name - def _save(self): - "Return metadata and a dictionary containing the sketch info." - e = dict(self.d) - minhash = self.minhash - - sketch = {} - sketch['ksize'] = int(minhash.ksize) - sketch['num'] = minhash.num - sketch['max_hash'] = minhash.max_hash - sketch['seed'] = int(minhash.seed) - if self.minhash.track_abundance: - values = minhash.get_mins(with_abundance=True) - sketch['mins'] = list(map(int, values.keys())) - sketch['abundances'] = list(map(int, values.values())) - else: - sketch['mins'] = list(map(int, minhash.get_mins())) - sketch['md5sum'] = self.md5sum() - - if minhash.is_protein and not minhash.dayhoff and not minhash.hp: - sketch['molecule'] = 'protein' - elif minhash.dayhoff: - sketch['molecule'] = 'dayhoff' - elif minhash.hp: - sketch['molecule'] = 'hp' - else: - sketch['molecule'] = 'DNA' - - e['signature'] = sketch - - return self.d.get('name'), self.d.get('filename'), sketch - def similarity(self, other, ignore_abundance=False, downsample=False): - "Compute similarity with the other MinHash signature." + "Compute similarity with the other signature." try: return self.minhash.similarity(other.minhash, ignore_abundance) except ValueError as e: - if 'mismatch in max_hash' in str(e) and downsample: + if "mismatch in max_hash" in str(e) and downsample: xx = self.minhash.downsample_max_hash(other.minhash) yy = other.minhash.downsample_max_hash(self.minhash) return xx.similarity(yy, ignore_abundance) @@ -136,53 +148,46 @@ def contained_by(self, other, downsample=False): try: return self.minhash.contained_by(other.minhash) except ValueError as e: - if 'mismatch in max_hash' in str(e) and downsample: + if "mismatch in max_hash" in str(e) and downsample: xx = self.minhash.downsample_max_hash(other.minhash) yy = other.minhash.downsample_max_hash(self.minhash) return xx.contained_by(yy) else: raise + def __getstate__(self): # enable pickling + return ( + self.minhash, + self._name, + self.filename, + ) -def _guess_open(filename): - """ - Make a best-effort guess as to how to parse the given sequence file. - - Handles '-' as shortcut for stdin. - Deals with .gz and .bz2 as well as plain text. - """ - magic_dict = { - b"\x1f\x8b\x08": "gz", - b"\x42\x5a\x68": "bz2", - } # Inspired by http://stackoverflow.com/a/13044946/1585509 - - if filename == '-': - filename = '/dev/stdin' - - bufferedfile = io.open(file=filename, mode='rb', buffering=8192) - num_bytes_to_peek = max(len(x) for x in magic_dict) - file_start = bufferedfile.peek(num_bytes_to_peek) - compression = None - for magic, ftype in magic_dict.items(): - if file_start.startswith(magic): - compression = ftype - break - if compression is 'bz2': - sigfile = bz2file.BZ2File(filename=bufferedfile) - elif compression is 'gz': - if not bufferedfile.seekable(): - bufferedfile.close() - raise ValueError("gziped data not streamable, pipe through zcat \ - first") - sigfile = gzip.GzipFile(filename=filename) - else: - sigfile = bufferedfile - - return sigfile + def __setstate__(self, tup): + (mh, name, filename) = tup + self.__del__() + self._objptr = lib.signature_new() + if name: + self._name = name + if filename: + self.filename = filename + self.minhash = minhash -def load_signatures(data, ksize=None, select_moltype=None, - ignore_md5sum=False, do_raise=False, quiet=False): + def __reduce__(self): + return ( + SourmashSignature, + ( + self.minhash, + self._name, + self.filename + ), + ) + + +def load_signatures( + data, ksize=None, select_moltype=None, ignore_md5sum=False, do_raise=False, + quiet=False +): """Load a JSON string with signatures into classes. Returns list of SourmashSignature objects. @@ -196,44 +201,99 @@ def load_signatures(data, ksize=None, select_moltype=None, return is_fp = False - if hasattr(data, 'find') and data.find('sourmash_signature') == -1: # filename - done = False - try: # is it a file handle? - data.read - is_fp = True - done = True + is_filename = False + is_fobj = False + if hasattr(data, "fileno"): + is_fp = True + elif os.path.exists(data): # filename + is_filename = True + elif hasattr(data, "mode"): # file object-like + is_fobj = True + if "t" in data.mode: # need to reopen handler as binary + if sys.version_info >= (3,): + data = data.buffer + elif hasattr(data, "find") and data.find("sourmash_signature") > 0: + # json string containing the data + if hasattr(data, "encode"): + data = data.encode("utf-8") + else: + if do_raise: + raise ValueError("Can't parse data. No such file or invalid data.") + return + + if ksize is None: + ksize = 0 + + if select_moltype is None: + select_moltype = ffi.NULL + else: + try: + select_moltype = select_moltype.encode("utf-8") except AttributeError: pass - # not a file handle - treat it like a filename. - if not done: - try: - data = _guess_open(data) - is_fp = True - except OSError as excinfo: - if not quiet: error(str(excinfo)) - if do_raise: - raise - return - else: # file-like - if hasattr(data, 'mode'): # file handler - if 't' in data.mode: # need to reopen handler as binary - if sys.version_info >= (3, ): - data = data.buffer + size = ffi.new("uintptr_t *") try: # JSON format - for sig in signature_json.load_signatures_json(data, - ignore_md5sum=ignore_md5sum): - if not ksize or ksize == sig.minhash.ksize: - if not select_moltype or \ - sig.minhash.is_molecule_type(select_moltype): - if select_moltype == 'protein': - if any(sig.minhash.is_molecule_type(t) for t in ('dayhoff', 'hp')): - # dayhoff and hp are also protein MHs. only yield - # sig if it is exactly one of (protein, hp, dayhoff) - continue - yield sig + if is_fp or is_fobj: + # TODO: we still can't pass a file-like object to rust... + try: + buf = data.read() + is_fp = False + is_fobj = False + data.close() + data = buf + except AttributeError: + pass + + if hasattr(data, "encode"): + data = data.encode("utf-8") + + # TODO: use ffi.cast in the future? + # fp_c = ffi.cast("FILE *", data) + # sigs_ptr = rustcall(lib.signatures_load_file, fp_c, ignore_md5sum, size) + + sigs_ptr = rustcall( + lib.signatures_load_buffer, + data, + len(data), + ignore_md5sum, + ksize, + select_moltype, + size, + ) + elif is_filename: + sigs_ptr = rustcall( + lib.signatures_load_path, + data.encode("utf-8"), + ignore_md5sum, + ksize, + select_moltype, + size, + ) + + else: + sigs_ptr = rustcall( + lib.signatures_load_buffer, + data, + len(data), + ignore_md5sum, + ksize, + select_moltype, + size + ) + + size = ffi.unpack(size, 1)[0] + + sigs = [] + for i in range(size): + sig = SourmashSignature._from_objptr(sigs_ptr[i]) + sigs.append(sig) + + for sig in sigs: + yield sig + except Exception as e: if not quiet: error("Error in parsing signature; quitting.") @@ -241,15 +301,14 @@ def load_signatures(data, ksize=None, select_moltype=None, if do_raise: raise finally: - if is_fp: + if is_fp or is_fobj: data.close() -def load_one_signature(data, ksize=None, select_moltype=None, - ignore_md5sum=False): - sigiter = load_signatures(data, ksize=ksize, - select_moltype=select_moltype, - ignore_md5sum=ignore_md5sum) +def load_one_signature(data, ksize=None, select_moltype=None, ignore_md5sum=False): + sigiter = load_signatures( + data, ksize=ksize, select_moltype=select_moltype, ignore_md5sum=ignore_md5sum + ) try: first_sig = next(sigiter) @@ -266,4 +325,24 @@ def load_one_signature(data, ksize=None, select_moltype=None, def save_signatures(siglist, fp=None): "Save multiple signatures into a JSON string (or into file handle 'fp')" - return signature_json.save_signatures_json(siglist, fp) + attached_refs = weakref.WeakKeyDictionary() + collected = [] + for obj in siglist: + rv = obj._get_objptr() + attached_refs[rv] = obj + collected.append(rv) + siglist_c = ffi.new("Signature*[]", collected) + + if fp is None: + buf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected)) + return decode_str(buf, free=True) + else: + # fp_c = ffi.cast("FILE *", fp) + # buf = rustcall(lib.signatures_save_file, siglist_c, len(collected), fp_c) + buf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected)) + result = decode_str(buf, free=True) + try: + fp.write(result) + except TypeError: + fp.write(result.encode('utf-8')) + return None diff --git a/sourmash/signature_json.py b/sourmash/signature_json.py deleted file mode 100644 index 079e2e849c..0000000000 --- a/sourmash/signature_json.py +++ /dev/null @@ -1,310 +0,0 @@ -""" -Extension to sourmash.signature using JSON (making load times of collection of signatures -10 to 20 times faster). -- Laurent Gautier -""" - -# This was written for Python 3, may be there is a chance it will work with Python 2... -from __future__ import print_function, unicode_literals - -import io -import json -import time -try: - import ijson.backends.yajl2 as ijson -except ImportError: - import ijson - - -from . import DEFAULT_SEED, MinHash -from .logging import notify - - -def _json_next_atomic_array(iterable, prefix_item = 'item', ijson = ijson): - """ - - iterable: iterator as returned by ijson.parse - - prefix_item: prefix found for items in the JSON array - - ijson: ijson backend - """ - l = list() - prefix, event, value = next(iterable) - while event != 'start_array': - prefix, event, value = next(iterable) - prefix, event, value = next(iterable) - while event != 'end_array': - #assert prefix == prefix_item - l.append(value) - prefix, event, value = next(iterable) - return tuple(l) - - -def _json_next_signature(iterable, - name = None, - filename = None, - ignore_md5sum=False, - prefix_item='abundances.item', - ijson = ijson): - """Helper function to unpack and check one signature block only. - - iterable: an iterable such the one returned by ijson.parse() - - name: - - filename: - - ignore_md5sum: - - prefix_item: required when parsing nested JSON structures - - ijson: ijson backend to use. - """ - from .signature import SourmashSignature - - d = dict() - prefix, event, value = next(iterable) - if event == 'start_map': - prefix, event, value = next(iterable) - while event != 'end_map': - key = value - if key == 'mins': - value = _json_next_atomic_array(iterable, - prefix_item=prefix_item, ijson=ijson) - elif key == 'abundances': - value = _json_next_atomic_array(iterable, - prefix_item=prefix_item, ijson=ijson) - else: - prefix, event, value = next(iterable) - d[key] = value - prefix, event, value = next(iterable) - - ksize = d['ksize'] - mins = d['mins'] - n = d['num'] - if n == 0xffffffff: # load legacy signatures where n == -1 - n = 0 - max_hash = d.get('max_hash', 0) - seed = d.get('seed', DEFAULT_SEED) - - molecule = d.get('molecule', 'DNA') - if molecule == 'protein': - is_protein = True - dayhoff = False - hp = False - elif molecule == "dayhoff": - is_protein = True - dayhoff = True - hp = False - elif molecule == "hp": - is_protein = True - dayhoff = False - hp = True - elif molecule.upper() == 'DNA': - is_protein = False - dayhoff = False - hp = False - else: - raise Exception("unknown molecule type: {}".format(molecule)) - - - track_abundance = False - if 'abundances' in d: - track_abundance = True - - e = MinHash(ksize=ksize, n=n, is_protein=is_protein, - dayhoff=dayhoff, hp=hp, - track_abundance=track_abundance, - max_hash=max_hash, seed=seed) - - if not track_abundance: - for m in mins: - e.add_hash(m) - else: - abundances = list(map(int, d['abundances'])) - e.set_abundances(dict(zip(mins, abundances))) - - sig = SourmashSignature(e) - - if not ignore_md5sum: - md5sum = d['md5sum'] - if md5sum != sig.md5sum(): - raise Exception('error loading - md5 of minhash does not match') - - if name: - sig.d['name'] = name - if filename: - sig.d['filename'] = filename - - return sig - -def load_signature_json(iterable, - ignore_md5sum=False, - prefix_item='signatures.item.mins.item', - ijson = ijson): - """ - - iterable: an iterable such as the one returned by `ijson.parse()` - - ignore_md5sum: - - prefix_item: prefix required to parse nested JSON structures - - ijson: ijson backend to use - """ - d = dict() - prefix, event, value = next(iterable) - if event != 'start_map': - raise ValueError('expected "start_map".') - - prefix, event, value = next(iterable) - while event != 'end_map': - assert event == 'map_key' - key = value - if key == 'signatures': - signatures = list() - prefix, event, value = next(iterable) - assert event == 'start_array' - while event != 'end_array': - sig = _json_next_signature(iterable, - name = None, - filename = None, - ignore_md5sum=ignore_md5sum, - prefix_item=prefix_item, - ijson=ijson) - signatures.append(sig) - prefix, event, value = next(iterable) - value = signatures - else: - prefix, event, value = next(iterable) - d[key] = value - prefix, event, value = next(iterable) - - # name, and filename not assumed to be parsed before the 'signatures' - for sig in signatures: - if 'name' in d: - sig.d['name'] = d['name'] - if 'filename' in d: - sig.d['filename'] = d['filename'] - - # hardcode in support only for CC0 going forward - if d.get('license', 'CC0') != 'CC0': - raise Exception("sourmash only supports CC0-licensed signatures.") - - sig.d['license'] = d.get('license', 'CC0') - - return d - - -def load_signatureset_json_iter(data, ksize=None, ignore_md5sum=False, ijson=ijson): - """ - - data: file handle (or file handle-like) object - - ksize: - - ignore_md5sum: - - ijson: ijson backend - """ - - parser = ijson.parse(data) - - prefix, event, value = next(parser) - assert prefix == '' and event == 'start_array' and value is None - - n = 0 - while True: - try: - sig = load_signature_json(parser, - prefix_item = 'item.signatures.item.mins.item', - ignore_md5sum=ignore_md5sum, - ijson=ijson) - if not ksize or ksize == sig.minhash.ksize: - yield sig - except ValueError: - # possible end of the array of signatures - try: - prefix, event, value = next(parser) - assert event == 'end_array' - except StopIteration: - pass - finally: - break - n += 1 - -def load_signatures_json(data, ksize=None, ignore_md5sum=True, ijson=ijson): - """ - - data: file handle (or file handle-like) object - - ksize: - - ignore_md5sum: - - ijson: ijson backend - """ - n = 0 - - if isinstance(data, str): - data = io.BytesIO(data.encode('utf-8')) - - it = load_signatureset_json_iter(data, ksize=ksize, - ignore_md5sum=ignore_md5sum, - ijson=ijson) - - for n, sigset in enumerate(it): - if n > 0 and n % 100 == 0: - notify('\r...sig loading {:,}', n, end='', flush=True) - for sig in sigset['signatures']: - yield sig - - if n > 1: - notify('\r...sig loading {:,}', n, flush=True) - - -def add_meta_save(siglist): - """ Convert one signature into a JSON dict - - siglist: sequence of SourmashSignature objects - - index: index of siglist to save - """ - from .signature import SIGNATURE_VERSION - records = [] - top_records = {} - - for sig in siglist: - name, filename, sketch = sig._save() - k = (name, filename) - x = top_records.get(k, []) - x.append(sketch) - top_records[k] = x - - if not top_records: - return records - - for (name, filename), sketches in top_records.items(): - record = {} - if name: - record['name'] = name - if filename: - record['filename'] = filename - record['signatures'] = sketches - - record['version'] = SIGNATURE_VERSION - record['class'] = 'sourmash_signature' - record['hash_function'] = '0.murmur64' - record['license'] = 'CC0' - record['email'] = '' - records.append(record) - - return records - - -def write_records_to_json(records, fp=None, indent=None, sort_keys=True): - s = json.dumps(records, indent=indent, sort_keys=sort_keys, separators=(str(','), str(':'))) - if fp: - try: - fp.write(s) - except TypeError: # fp is opened in binary mode - try: - fp.write(s.encode('utf-8')) - except TypeError: # Python 2 - fp.write(unicode(s)) - return None - return s - - -def save_signatures_json( - siglist, fp=None, indent=None, sort_keys=True): - """ Save multiple signatures into a JSON string (or into file handle 'fp') - - siglist: sequence of SourmashSignature objects - - fp: file handle to the location of a sig file - - indent: indentation spaces (an integer) or if None no indentation - - sort_keys: sort the keys in mappings before writting to JSON - """ - startt = time.time() - records = add_meta_save(siglist) - if records == []: - return "" - s = write_records_to_json(records, fp, indent, sort_keys) - notify("time taken to save signatures is {:.5f} seconds", time.time() - startt, end="\r") - return s diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index e496173737..bc7990d0fd 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -156,7 +156,7 @@ def load_query_signature(filename, ksize, select_moltype): select_moltype=select_moltype, do_raise=True) sl = list(sl) - except IOError: + except (IOError, ValueError): error("Cannot open file '{}'", filename) sys.exit(-1) diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs index ae0a01438b..a7d1828457 100644 --- a/src/ffi/minhash.rs +++ b/src/ffi/minhash.rs @@ -3,6 +3,7 @@ use std::os::raw::c_char; use std::slice; use crate::errors::SourmashError; +use crate::ffi::utils::SourmashStr; use crate::signature::SigsTrait; use crate::sketch::minhash::{ aa_to_dayhoff, aa_to_hp, translate_codon, HashFunctions, KmerMinHash, @@ -162,6 +163,18 @@ unsafe fn kmerminhash_get_mins(ptr: *mut KmerMinHash) -> Result<*const u64> { } } +ffi_fn! { +unsafe fn kmerminhash_md5sum(ptr: *mut KmerMinHash) -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let output = mh.md5sum(); + + Ok(SourmashStr::from_string(output)) +} +} + ffi_fn! { unsafe fn kmerminhash_get_abunds(ptr: *mut KmerMinHash) -> Result<*const u64> { let mh = { diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs index ce5ec103b3..0ef77b734c 100644 --- a/src/sketch/minhash.rs +++ b/src/sketch/minhash.rs @@ -28,6 +28,21 @@ pub enum HashFunctions { murmur64_hp = 4, } +impl std::fmt::Display for HashFunctions { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{}", + match self { + HashFunctions::murmur64_DNA => "dna", + HashFunctions::murmur64_protein => "protein", + HashFunctions::murmur64_dayhoff => "dayhoff", + HashFunctions::murmur64_hp => "hp", + } + ) + } +} + impl TryFrom<&str> for HashFunctions { type Error = Error; @@ -90,21 +105,7 @@ impl Serialize for KmerMinHash { partial.serialize_field("abundances", abunds)?; } - partial.serialize_field( - "molecule", - match &self.is_protein() { - true => { - if self.dayhoff() { - "dayhoff" - } else if self.hp() { - "hp" - } else { - "protein" - } - } - false => "DNA", - }, - )?; + partial.serialize_field("molecule", &self.hash_function.to_string())?; partial.end() } @@ -133,17 +134,32 @@ impl<'de> Deserialize<'de> for KmerMinHash { let hash_function = match tmpsig.molecule.to_lowercase().as_ref() { "protein" => HashFunctions::murmur64_protein, "dayhoff" => HashFunctions::murmur64_dayhoff, + "hp" => HashFunctions::murmur64_hp, "dna" => HashFunctions::murmur64_DNA, _ => unimplemented!(), // TODO: throw error here }; + // This shouldn't be necessary, but at some point we + // created signatures with unordered mins =( + let (mins, abunds) = if let Some(abunds) = tmpsig.abundances { + let mut values: Vec<(_, _)> = tmpsig.mins.iter().zip(abunds.iter()).collect(); + values.sort(); + let mins = values.iter().map(|(v, _)| **v).collect(); + let abunds = values.iter().map(|(_, v)| **v).collect(); + (mins, Some(abunds)) + } else { + let mut values: Vec<_> = tmpsig.mins.into_iter().collect(); + values.sort(); + (values, None) + }; + Ok(KmerMinHash { num, ksize: tmpsig.ksize, seed: tmpsig.seed, max_hash: tmpsig.max_hash, - mins: tmpsig.mins, - abunds: tmpsig.abundances, + mins, + abunds, hash_function, }) } diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 7335c484fe..341703952a 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -668,7 +668,6 @@ def test_mh_merge_check_length2(track_abundance): c = a.merge(b) assert len(c.get_mins()) == 3 - def test_mh_asymmetric_merge(track_abundance): # test merging two asymmetric (different size) MHs a = MinHash(20, 10, track_abundance=track_abundance) diff --git a/tests/test_signature.py b/tests/test_signature.py index 6a00f7a9e9..4b0ae3d4a5 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -86,7 +86,7 @@ def test_str(track_abundance): assert str(sig) == 'SourmashSignature(59502a74)' assert repr(sig) == 'SourmashSignature(59502a74)' - sig.d['name'] = 'fizbar' + sig._name = 'fizbar' assert str(sig) == 'SourmashSignature(\'fizbar\', 59502a74)' assert repr(sig) == 'SourmashSignature(\'fizbar\', 59502a74)' @@ -191,7 +191,6 @@ def test_md5(track_abundance): e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) e.add_hash(5) sig = SourmashSignature(e) - print(sig._save()) assert sig.md5sum() == 'eae27d77ca20db309e056e3d2dcd7d69', sig.md5sum() diff --git a/tests/test_signature_json.py b/tests/test_signature_json.py deleted file mode 100644 index c5628b5d05..0000000000 --- a/tests/test_signature_json.py +++ /dev/null @@ -1,163 +0,0 @@ -import sys -import io -import json -import ijson -import sourmash -from sourmash.signature import SourmashSignature -from sourmash.signature_json import (_json_next_atomic_array, - _json_next_signature, - load_signature_json, - load_signatures_json, - load_signatureset_json_iter, - save_signatures_json) -from collections import OrderedDict - -def test__json_next_atomic_array(): - t = (2,3,4,5,6) - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - it = ijson.parse(io.BytesIO(s.encode('utf-8'))) - a = _json_next_atomic_array(it) - assert len(t) == len(a) - assert all(x == y for x,y in zip(t, a)) - -# integration test more than a unit test... -def test__json_next_signature(): - - name = 'Foo Bar' - filename = '/tmp/foobar' - - minhash = (2,3,4,5,6) - t = OrderedDict((('ksize', 21), - ('num', len(minhash)), - #('md5sum', ), - ('cardinality', 123456), - ('mins', minhash))) - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - it = ijson.parse(io.BytesIO(s.encode('utf-8'))) - # no MD5SUM - sig = _json_next_signature(it, name, filename, - ignore_md5sum=True, - ijson=ijson) - - ## check MD5SUM - minhash = (5,) - t = OrderedDict((('ksize', 20), - ('num', len(minhash)), - ('md5sum', 'eae27d77ca20db309e056e3d2dcd7d69'), - ('cardinality', 123456), - ('mins', minhash))) - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - it = ijson.parse(io.BytesIO(s.encode('utf-8'))) - sig = _json_next_signature(it, name, filename, - ignore_md5sum=False, - ijson=ijson) - -# integration test more than a unit test -def test_load_signature_json(): - name = 'Foo Bar' - filename = '/tmp/foobar' - - minhash = (2,3,4,5,6) - t = OrderedDict((('name', name), - ('filename', filename), - ('signatures', - ( - OrderedDict((('ksize', 21), - ('num', len(minhash)), - #('md5sum', ), - ('cardinality', 123456), - ('mins', minhash))), - )))) - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - it = ijson.parse(io.BytesIO(s.encode('utf-8'))) - # no MD5SUM - sig_entry = load_signature_json(it, ignore_md5sum=True) - -# integration test more than a unit test -def test_load_signaturesset_json_iter(): - - t = list() - for name, filename in (('Foo', '/tmp/foo'), - ('Bar', '/tmp/bar')): - minhash = (2,3,4,5,6) - t.append(OrderedDict(( - ('class', 'sourmash_signature'), - ('name', name), - ('filename', filename), - ('signatures', - ( - OrderedDict((('ksize', 21), - ('num', len(minhash)), - #('md5sum', ), - ('cardinality', 123456), - ('mins', minhash))), - ))))) - - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - # no MD5SUM - sig_entries = tuple(load_signatureset_json_iter(io.BytesIO(s.encode('utf-8')), - ignore_md5sum=True, - ijson=ijson)) - assert len(sig_entries) == 2 - - -# integration test more than a unit test -def test_load_signaturesset_json_iter_molecules(): - - t = list() - molecules = 'DNA', 'protein', 'dayhoff', 'hp' - names = "Foo", 'Bar', "Biz", "Baz" - filenames = '/tmp/foo', '/tmp/bar', '/tmp/biz', '/tmp/baz' - - for molecule, name, filename in zip(molecules, names, filenames): - minhash = (2,3,4,5,6) - t.append(OrderedDict(( - ('class', 'sourmash_signature'), - ('name', name), - ('filename', filename), - ('signatures', - ( - OrderedDict((('ksize', 21), - ('num', len(minhash)), - #('md5sum', ), - ('molecule', molecule), - ('cardinality', 123456), - ('mins', minhash))), - ))))) - - s = json.dumps(t) - if sys.version_info[0] < 3: - s = unicode(s) - # no MD5SUM - sig_entries = tuple(load_signatureset_json_iter(io.BytesIO(s.encode('utf-8')), - ignore_md5sum=True, - ijson=ijson)) - # Ensure all molecule types were read properly - assert len(sig_entries) == 4 - -def test_save_load_multisig_json(): - e1 = sourmash.MinHash(n=1, ksize=20) - sig1 = SourmashSignature(e1) - - e2 = sourmash.MinHash(n=1, ksize=25) - sig2 = SourmashSignature(e2) - - x = save_signatures_json([sig1, sig2]) - y = list(load_signatures_json(x)) - - print(x) - - assert len(y) == 2 - assert sig1 in y # order not guaranteed, note. - assert sig2 in y - assert sig1 != sig2 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index c9073d4e4e..c1df69d891 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -3139,7 +3139,7 @@ def test_license_cc0(): sig = next(signature.load_signatures(sigfile)) assert sig.name().endswith('short.fa') - assert sig.d['license'] == 'CC0' + assert sig.license == 'CC0' def test_license_non_cc0():