diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index d8aeeb7877..819737e8e2 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -1,3 +1,4 @@ +# note: to invalidate caches, adjust the pip-v? and tox-v? numbers below. name: Python tests on: @@ -35,9 +36,9 @@ jobs: uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} + key: ${{ runner.os }}-pip-v2-${{ hashFiles('**/setup.cfg') }} restore-keys: | - ${{ runner.os }}-pip- + ${{ runner.os }}-pip-v2- - name: Install dependencies run: | @@ -64,9 +65,9 @@ jobs: uses: actions/cache@v3 with: path: .tox/ - key: ${{ runner.os }}-tox-${{ hashFiles('**/setup.py') }} + key: ${{ runner.os }}-tox-v2-${{ hashFiles('**/setup.cfg') }} restore-keys: | - ${{ runner.os }}-tox- + ${{ runner.os }}-tox-v2- - name: Test with tox run: tox diff --git a/doc/command-line.md b/doc/command-line.md index 88cae603cf..3d6212c769 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -727,7 +727,7 @@ database. It can be used to combine multiple taxonomies into a single file, as well as change formats between CSV and sqlite3. The following command will take in two taxonomy files and combine them into -a single taxonomy sqlite database. +a single taxonomy SQLite database. ``` sourmash tax prepare --taxonomy file1.csv file2.csv -o tax.db @@ -931,6 +931,15 @@ As of sourmash 4.2.0, `lca index` supports `--picklist`, to can be used to index a subset of a large collection, or to exclude a few signatures from an index being built from a large collection. +As of sourmash 4.4.0, `lca index` can produce an _on disk_ LCA +database using SQLite. To prepare such a database, use +`sourmash lca index ... -F sql`. + +All sourmash commands work with either type of LCA database (the +default JSON database, and the SQLite version). 
SQLite databases are +larger than JSON databases on disk but are typically much faster +to load and search, and use much less memory. + ### `sourmash lca rankinfo` - examine an LCA database The `sourmash lca rankinfo` command displays k-mer specificity @@ -1399,6 +1408,14 @@ iterating over the signatures in the input file. This can be slow for large collections. Use `--no-rebuild-manifest` to load an existing manifest if it is available. +As of sourmash 4.4.0, `sig manifest` can produce a manifest in a fast +on-disk format (a SQLite database). SQLite manifests can be _much_ +faster when working with very large collections of signatures. +To produce a SQLite manifest, use `sourmash sig manifest ... -F sql`. + +All sourmash commands that work with manifests will accept both +CSV and SQLite manifest files. + ### `sourmash signature check` - compare picklists and manifests Compare picklists and manifests across databases, and optionally output matches @@ -1452,7 +1469,7 @@ Briefly, None of these commands currently support searching, comparing, or indexing signatures with multiple ksizes or moltypes at the same time; you need -to pick the ksize and moltype to use for your search. Where possible, +to pick the ksize and moltype to use for your query. Where possible, scaled values will be made compatible. ### Selecting signatures @@ -1549,9 +1566,10 @@ In addition to `sig extract`, the following commands support ### Storing (and searching) signatures Backing up a little, there are many ways to store and search -signatures. `sourmash` supports storing and loading signatures from JSON -files, directories, lists of files, Zip files, and indexed databases. -These can all be used interchangeably for sourmash operations. +signatures. `sourmash` supports storing and loading signatures from +JSON files, directories, lists of files, Zip files, custom indexed +databases, and SQLite databases. These can all be used +interchangeably for most sourmash operations. 
The simplest is one signature in a single JSON file. You can also put many signatures in a single JSON file, either by building them that @@ -1567,7 +1585,7 @@ signatures from zip files. You can create a compressed collection of signatures using `zip -r collection.zip *.sig` and then specify `collections.zip` on the command line. -### Saving signatures, more generally +### Choosing signature output formats (sourmash v4.1 and later) @@ -1583,6 +1601,7 @@ This behavior is triggered by the requested output filename -- * to save to gzipped JSON signature files, use `.sig.gz`; * to save to a Zip file collection, use `.zip`; * to save signature files to a directory, use a name ending in `/`; the directory will be created if it doesn't exist; +* to save to a SQLite database, use `.sqldb` (as of sourmash v4.4.0). If none of these file extensions is detected, output will be written in the JSON `.sig` format, either to the provided output filename or @@ -1614,22 +1633,36 @@ Indexed databases can make searching signatures much faster. SBT databases are low memory and disk-intensive databases that allow for fast searches using a tree structure, while LCA databases are higher memory and (after a potentially significant load time) are quite fast. +SQLite databases (new in sourmash v4.4.0) are typically larger on disk +than SBTs and LCAs, but in turn are fast to load and support very low +memory search. (LCA databases also directly permit taxonomic searches using `sourmash lca` functions.) Commands that take multiple signatures or collections of signatures -will also work with databases. +will also work with indexed databases. -One limitation of indexed databases is that both SBT and LCA database -can only contain one "type" of signature (one ksize/one moltype at one -scaled value). If the database signature type is incompatible with the -other signatures, sourmash will complain appropriately. 
+One limitation of indexed databases is that they are all restricted +to certain kinds of signatures. Both SBT and LCA databases can only +contain one "type" of signature (one ksize/one moltype at one scaled +value). SQLite databases can contain multiple ksizes and moltypes, but +only at one scaled value. If the database signature type is +incompatible with the other signatures, sourmash will complain +appropriately. In contrast, signature files, zip collections, and directory hierarchies can contain many different types of signatures, and compatible ones will be selected automatically. +Use the `sourmash index` command to create an SBT. + +Use the `sourmash lca index` command to create an LCA database; the +database can be saved in JSON or SQL format with `-F json` or `-F sql`. + +Use `sourmash sig cat ... -o collection.sqldb` to create +a SQLite indexed database. + ### Combining search databases on the command line All of the commands in sourmash operate in "online" mode, so you can diff --git a/setup.cfg b/setup.cfg index 0ce8786108..17b86f9252 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ install_requires = scipy deprecation>=2.0.6 cachetools>=4,<6 + bitstring>=3.1.9,<4 python_requires = >=3.8 [bdist_wheel] diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index fd205b6f9e..14c6cca1b2 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -59,6 +59,12 @@ def subparser(subparsers): '--fail-on-missing-taxonomy', action='store_true', help='fail quickly if taxonomy is not available for an identifier', ) + subparser.add_argument( + '-F', '--database-format', + help="format of output database; default is 'json')", + default='json', + choices=['json', 'sql'], + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index b05be22c84..744dec34c1 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -56,6 +56,10 @@ def 
subparser(subparsers): '-q', '--quiet', action='store_true', help='suppress non-error output' ) + subparser.add_argument( + '-d', '--debug', action='store_true', + help='output debug information' + ) subparser.add_argument( '--threshold', metavar='T', default=0.08, type=float, help='minimum threshold for reporting matches; default=0.08' diff --git a/src/sourmash/cli/sig/check.py b/src/sourmash/cli/sig/check.py index e218850d19..d8308173aa 100644 --- a/src/sourmash/cli/sig/check.py +++ b/src/sourmash/cli/sig/check.py @@ -55,6 +55,13 @@ def subparser(subparsers): help='do not require a manifest; generate dynamically if needed', action='store_true' ) + subparser.add_argument( + '-F', '--manifest-format', + help="format of manifest output file; default is 'csv')", + default='csv', + choices=['csv', 'sql'], + ) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_pattern_args(subparser) diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index 0562ee2c5d..e066dbda67 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -40,7 +40,12 @@ def subparser(subparsers): '--no-rebuild-manifest', help='use existing manifest if available', action='store_true' ) - + subparser.add_argument( + '-F', '--manifest-format', + help="format of manifest output file; default is 'csv')", + default='csv', + choices=['csv', 'sql'], + ) def main(args): import sourmash diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 710dfa2f58..1f9b56f441 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -441,7 +441,7 @@ def search(args): from .search import (search_databases_with_flat_query, search_databases_with_abund_query) - set_quiet(args.quiet) + set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) picklist = sourmash_args.load_picklist(args) pattern_search = sourmash_args.load_include_exclude_db_patterns(args) diff --git a/src/sourmash/index/__init__.py 
b/src/sourmash/index/__init__.py index 770c677f5c..d575132437 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -868,6 +868,15 @@ class MultiIndex(Index): Note: this is an in-memory collection, and does not do lazy loading: all signatures are loaded upon instantiation and kept in memory. + There are a variety of loading functions: + * `load` takes a list of already-loaded Index objects, + together with a list of their locations. + * `load_from_directory` traverses a directory to load files within. + * `load_from_path` takes an arbitrary pathname and tries to load it + as a directory, or as a .sig file. + * `load_from_pathlist` takes a text file full of pathnames and tries + to load them all. + Concrete class; signatures held in memory; builds and uses manifests. """ def __init__(self, manifest, parent, *, prepend_location=False): @@ -1212,8 +1221,7 @@ def load(cls, location, *, prefix=None): if not os.path.isfile(location): raise ValueError(f"provided manifest location '{location}' is not a file") - with open(location, newline='') as fp: - m = CollectionManifest.load_from_csv(fp) + m = CollectionManifest.load_from_filename(location) if prefix is None: prefix = os.path.dirname(location) @@ -1245,20 +1253,12 @@ def _signatures_with_internal(self): manifest in this class. """ # collect all internal locations - iloc_to_rows = defaultdict(list) - for row in self.manifest.rows: - iloc = row['internal_location'] - iloc_to_rows[iloc].append(row) - - # iterate over internal locations, selecting relevant sigs - for iloc, iloc_rows in iloc_to_rows.items(): - # prepend with prefix? + picklist = self.manifest.to_picklist() + for iloc in self.manifest.locations(): + # prepend location with prefix? 
if not iloc.startswith('/') and self.prefix: iloc = os.path.join(self.prefix, iloc) - sub_mf = CollectionManifest(iloc_rows) - picklist = sub_mf.to_picklist() - idx = sourmash.load_file_as_index(iloc) idx = idx.select(picklist=picklist) for ss in idx.signatures(): diff --git a/src/sourmash/index/sqlite_index.py b/src/sourmash/index/sqlite_index.py new file mode 100644 index 0000000000..9fc84ecb0f --- /dev/null +++ b/src/sourmash/index/sqlite_index.py @@ -0,0 +1,1092 @@ +"""sqlite3 based Index, CollectionManifest, and LCA_Database +implementations. + +These classes support a variety of flexible and fast on-disk storage, +search, and retrieval functions. + +SqliteIndex stores full scaled signatures; sketches are stored as +reverse-indexed collections of hashes. Search is optimized via the +reverse index. Num and abund sketches are not supported. All scaled +values must be the same upon insertion. Multiple moltypes _are_ +supported. + +SqliteCollectionManifest provides a full implementation of the +manifest API. It can store details for all signature types. When used +as part of a SqliteIndex database, it does not support independent +insertion. + +LCA_SqliteDatabase builds on top of SqliteIndex and LineageDB_Sqlite +(in the tax submodule) to provide a full on-disk implementation of +LCA_Database. + +Using these classes +------------------- + +These classes are fully integrated into sourmash loading. + +Internally, use `sqlite_index.load_sqlite_index(...)` to load a specific +file; this will return the appropriate SqliteIndex, StandaloneManifestIndex, +or LCA_Database object. + +Use `CollectionManifest.load_from_filename(...)` to load the manifest +directly as a manifest object. + +Implementation Details +---------------------- + +SqliteIndex: + +* Hashes with values above MAX_SQLITE_INT=2**63-1 are transformed into + signed long longs upon insertion, and then back into ulong longs upon + retrieval. + +* Hash overlap is calculated via a SELECT. 
+ +* SqliteIndex relies on SqliteCollectionManifest for manifest functionality, + including signature selection and picklists. + +SqliteCollectionManifest: + +* each object maintains info about whether it is being "managed" by a + SqliteIndex class or not. If it is, `_insert_row(...)` cannot be + called directly. + +* `select(...)` operates directly with SQL queries, except for + picklist selection, which involves inspecting each manifest row in + Python. In addition to being (much) simpler, this ends up being + faster in some important real world situations, even for millions of + rows! + +* filter_on_rows and filter_on_columns also both operate in Python, + not SQL. + +* for this reason, the `locations()` method returns a superset of + locations. This is potentially very significant if you do a select + with a picklist that ignores most sketches - the `locations()` + method will ignore the picklist. + +Limitations: + +* all of these classes share a single connection object, and it could + get confusing quickly if you simultaneously insert and query. We suggest + separating creation and insertion from querying. That having been said, these databases + should work fine for many simultaneous queries; just don't write :). + +""" +import time +import os +import sqlite3 +from collections import defaultdict +import itertools + +from bitstring import BitArray + +from sourmash.index import Index +from sourmash.exceptions import IndexNotSupported +from sourmash import MinHash, SourmashSignature +from sourmash.index import IndexSearchResult, StandaloneManifestIndex +from sourmash.picklist import SignaturePicklist +from sourmash.logging import debug_literal +from sourmash import sqlite_utils + +from sourmash.lca.lca_db import cached_property +from sourmash.manifest import BaseCollectionManifest + +# converters for unsigned 64-bit ints: if over MAX_SQLITE_INT, +# convert to signed int. 
+ +MAX_SQLITE_INT = 2 ** 63 - 1 +convert_hash_to = lambda x: BitArray(uint=x, length=64).int if x > MAX_SQLITE_INT else x +convert_hash_from = lambda x: BitArray(int=x, length=64).uint if x < 0 else x + + +def load_sqlite_index(filename, *, request_manifest=False): + """Load a SqliteIndex, SqliteCollectionManifest, or LCA_SqliteDatabase. + + This is the main top-level API for loading an Index-like object. The logic + is roughly: + + * does this database have both index and lineage tables? If so, + return an LCA_SqliteDatabase. + * if it only has an index, return a SqliteIndex. + * if it only has a manifest, return a StandaloneManifestIndex. + + If you would like only a manifest, specify 'request_manifest=True'. + """ + conn = sqlite_utils.open_sqlite_db(filename) + + if conn is None: + debug_literal("load_sqlite_index: conn is None.") + return + + c = conn.cursor() + internal_d = sqlite_utils.get_sourmash_internal(c) + + is_index = False + is_manifest = False + is_lca_db = False + + if 'SqliteIndex' in internal_d: + v = internal_d['SqliteIndex'] + if v != '1.0': + raise IndexNotSupported + is_index = True + debug_literal("load_sqlite_index: it's an index!") + + if is_index and 'SqliteLineage' in internal_d: + v = internal_d['SqliteLineage'] + if v != '1.0': + raise IndexNotSupported + + is_lca_db = True + debug_literal("load_sqlite_index: it's got a lineage table!") + + if internal_d['SqliteManifest']: + v = internal_d['SqliteManifest'] + if v != '1.0': + raise IndexNotSupported + is_manifest = True + debug_literal(f"load_sqlite_index: it's a manifest! request_manifest: {request_manifest}") + + # every Index is a Manifest! 
+ if is_index or is_lca_db: + assert is_manifest + + idx = None + if is_index and not request_manifest: + conn.close() + + if is_lca_db: + debug_literal("load_sqlite_index: returning LCA_SqliteDatabase") + idx = LCA_SqliteDatabase.load(filename) + else: + debug_literal("load_sqlite_index: returning SqliteIndex") + idx = SqliteIndex(filename) + elif is_manifest: + managed_by_index=False + if is_index: + assert request_manifest + managed_by_index=True + + prefix = os.path.dirname(filename) + mf = SqliteCollectionManifest(conn, managed_by_index=managed_by_index) + idx = StandaloneManifestIndex(mf, filename, prefix=prefix) + debug_literal("load_sqlite_index: returning StandaloneManifestIndex") + + return idx + + +class SqliteIndex(Index): + is_database = True + + # NOTE: we do not need _signatures_with_internal for this class + # because it supplies a manifest directly :tada:. + + def __init__(self, dbfile, *, sqlite_manifest=None, conn=None): + "Constructor. 'dbfile' should be valid filename or ':memory:'." + self.dbfile = dbfile + + # no connection? connect and/or create! + if conn is None: + conn = self._open(dbfile) + + # build me a SQLite manifest class to use for selection. + if sqlite_manifest is None: + sqlite_manifest = SqliteCollectionManifest(conn, + managed_by_index=True) + self.manifest = sqlite_manifest + self.conn = conn + + # set 'scaled'. + c = self.conn.cursor() + c.execute("SELECT DISTINCT scaled FROM sourmash_sketches") + scaled_vals = c.fetchall() + if len(scaled_vals) > 1: + raise ValueError("this database has multiple scaled values, which is not currently allowed") + + if scaled_vals: + self.scaled = scaled_vals[0][0] + else: + self.scaled = None + + @classmethod + def _open(cls, dbfile, *, empty_ok=True): + "Connect to existing SQLite database or create new." 
+ try: + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + c.execute("PRAGMA cache_size=10000000") + c.execute("PRAGMA synchronous = OFF") + c.execute("PRAGMA journal_mode = MEMORY") + c.execute("PRAGMA temp_store = MEMORY") + + if not empty_ok: + c.execute("SELECT * FROM sourmash_hashes LIMIT 1") + c.fetchone() + except (sqlite3.OperationalError, sqlite3.DatabaseError): + raise ValueError(f"cannot open '{dbfile}' as SqliteIndex database") + + return conn + + @classmethod + def load(self, dbfile): + "Load an existing SqliteIndex from dbfile." + return SqliteIndex(dbfile) + + @classmethod + def create(cls, dbfile, *, append=False): + "Create a new SqliteIndex in dbfile." + conn = cls._open(dbfile, empty_ok=True) + cls._create_tables(conn.cursor(), ignore_exists=append) + conn.commit() + + return cls(dbfile, conn=conn) + + @classmethod + def _create_tables(cls, c, *, ignore_exists=False): + "Create sqlite tables for SqliteIndex" + try: + sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', '1.0') + SqliteCollectionManifest._create_tables(c) + + c.execute(""" + CREATE TABLE IF NOT EXISTS sourmash_hashes ( + hashval INTEGER NOT NULL, + sketch_id INTEGER NOT NULL, + FOREIGN KEY (sketch_id) REFERENCES sourmash_sketches (id) + ) + """) + c.execute(""" + CREATE INDEX IF NOT EXISTS sourmash_hashval_idx ON sourmash_hashes ( + hashval, + sketch_id + ) + """) + c.execute(""" + CREATE INDEX IF NOT EXISTS sourmash_hashval_idx2 ON sourmash_hashes ( + hashval + ) + """) + c.execute(""" + CREATE INDEX IF NOT EXISTS sourmash_sketch_idx ON sourmash_hashes ( + sketch_id + ) + """ + ) + except (sqlite3.OperationalError, sqlite3.DatabaseError): + if not ignore_exists: + raise ValueError("cannot create SqliteIndex tables") + + return c + + def cursor(self): + return self.conn.cursor() + + def close(self): + self.conn.close() + + def commit(self): + self.conn.commit() + + def __len__(self): + return len(self.manifest) + + def insert(self, ss, *, cursor=None, commit=True): + """ + 
Insert a signature into the sqlite database. + + If a cursor object is supplied, use that cursor instead of + generating a new one. + + If 'commit' is True, commit after add; otherwise, do not. + """ + if cursor: + c = cursor + else: + c = self.conn.cursor() + + if ss.minhash.num: + raise ValueError("cannot store 'num' signatures in SqliteIndex") + if ss.minhash.track_abundance: + raise ValueError("cannot store signatures with abundance in SqliteIndex") + + if self.scaled is not None and self.scaled != ss.minhash.scaled: + raise ValueError(f"this database can only store scaled values={self.scaled}") + elif self.scaled is None: + self.scaled = ss.minhash.scaled + + # ok, first create and insert a manifest row + row = BaseCollectionManifest.make_manifest_row(ss, None, + include_signature=False) + self.manifest._insert_row(c, row, call_is_from_index=True) + + # retrieve ID of row for retrieving hashes: + c.execute("SELECT last_insert_rowid()") + sketch_id, = c.fetchone() + + # insert all the hashes + hashes_to_sketch = [] + for h in ss.minhash.hashes: + hh = convert_hash_to(h) + hashes_to_sketch.append((hh, sketch_id)) + + c.executemany("INSERT INTO sourmash_hashes (hashval, sketch_id) VALUES (?, ?)", + hashes_to_sketch) + + if commit: + self.conn.commit() + + @property + def location(self): + return self.dbfile + + def signatures(self): + "Return an iterator over all signatures in the Index object." + for ss, loc in self.signatures_with_location(): + yield ss + + def signatures_with_location(self): + "Return an iterator over tuples (signature, location) in the Index." + c = self.conn.cursor() + + for ss, loc, iloc in self._load_sketches(c): + yield ss, loc + + def save(self, *args, **kwargs): + raise NotImplementedError + + def find(self, search_fn, query, **kwargs): + search_fn.check_is_compatible(query) + + # check compatibility, etc. 
+ query_mh = query.minhash + if self.scaled > query_mh.scaled: + query_mh = query_mh.downsample(scaled=self.scaled) + + picklist = None + if self.manifest.selection_dict: + picklist = self.manifest.selection_dict.get('picklist') + + c1 = self.conn.cursor() + c2 = self.conn.cursor() + + debug_literal('running _get_matching_sketches...') + t0 = time.time() + xx = self._get_matching_sketches(c1, query_mh.hashes, + query_mh._max_hash) + for sketch_id, n_matching_hashes in xx: + debug_literal(f"...got sketch {sketch_id}, with {n_matching_hashes} matching hashes in {time.time() - t0:.2f}") + # + # first, estimate sketch size using sql results. + # + query_size = len(query_mh) + subj_size = self._load_sketch_size(c2, sketch_id, + query_mh._max_hash) + total_size = query_size + subj_size - n_matching_hashes + shared_size = n_matching_hashes + + score = search_fn.score_fn(query_size, shared_size, subj_size, + total_size) + + debug_literal(f"APPROX RESULT: score={score} qsize={query_size}, ssize={subj_size} total={total_size} overlap={shared_size}") + + # do we pass? + if not search_fn.passes(score): + debug_literal(f"FAIL score={score}") + + # CTB if we are doing containment only, we could break loop here. + # but for Jaccard, we must continue. + # see 'test_sqlite_jaccard_ordering' + + if search_fn.passes(score): + subj = self._load_sketch(c2, sketch_id) + if search_fn.collect(score, subj): + if picklist is None or subj in picklist: + yield IndexSearchResult(score, subj, self.location) + + def select(self, *, num=0, track_abundance=False, **kwargs): + "Run a select! This just modifies the manifest." 
+ # check SqliteIndex specific conditions on the 'select' + if num: + raise ValueError("cannot select on 'num' in SqliteIndex") + if track_abundance: + raise ValueError("cannot store or search signatures with abundance") + # create manifest if needed + manifest = self.manifest + if manifest is None: + manifest = SqliteCollectionManifest(self.conn, + managed_by_index=True) + + # modify manifest + manifest = manifest.select_to_manifest(**kwargs) + + # return a new SqliteIndex with a new manifest, but same old conn. + return SqliteIndex(self.dbfile, + sqlite_manifest=manifest, + conn=self.conn) + + # + # Actual SQL queries, etc. + # + + def _load_sketch_size(self, c1, sketch_id, max_hash): + "Get sketch size for given sketch, downsampled by max_hash." + if max_hash <= MAX_SQLITE_INT: + c1.execute(""" + SELECT COUNT(hashval) FROM sourmash_hashes + WHERE sketch_id=? AND hashval >= 0 AND hashval <= ?""", + (sketch_id, max_hash)) + else: + c1.execute('SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=?', + (sketch_id,)) + + n_hashes, = c1.fetchone() + return n_hashes + + def _load_sketch(self, c, sketch_id, *, match_scaled=None): + "Load an individual sketch. If match_scaled is set, downsample." 
+ + start = time.time() + c.execute(""" + SELECT id, name, scaled, ksize, filename, moltype, seed + FROM sourmash_sketches WHERE id=?""", (sketch_id,)) + debug_literal(f"load sketch {sketch_id}: got sketch info in {time.time() - start:.2f}") + + sketch_id, name, scaled, ksize, filename, moltype, seed = c.fetchone() + if match_scaled is not None: + scaled = max(scaled, match_scaled) + + is_protein = 1 if moltype=='protein' else 0 + is_dayhoff = 1 if moltype=='dayhoff' else 0 + is_hp = 1 if moltype=='hp' else 0 + + mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, + is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) + + + template_values = [sketch_id] + + hash_constraint_str = "" + max_hash = mh._max_hash + if max_hash <= MAX_SQLITE_INT: + hash_constraint_str = "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ? AND" + template_values.insert(0, max_hash) + else: + debug_literal('NOT EMPLOYING hash_constraint_str') + + debug_literal(f"finding hashes for sketch {sketch_id} in {time.time() - start:.2f}") + c.execute(f"SELECT hashval FROM sourmash_hashes WHERE {hash_constraint_str} sourmash_hashes.sketch_id=?", template_values) + + debug_literal(f"loading hashes for sketch {sketch_id} in {time.time() - start:.2f}") + for hashval, in c: + hh = convert_hash_from(hashval) + mh.add_hash(hh) + + debug_literal(f"done loading sketch {sketch_id} {time.time() - start:.2f})") + + return SourmashSignature(mh, name=name, filename=filename) + + def _load_sketches(self, c): + "Load sketches based on manifest _id column." 
+ for row in self.manifest.rows: + sketch_id = row['_id'] + assert row['num'] == 0 + + moltype = row['moltype'] + is_protein = 1 if moltype=='protein' else 0 + is_dayhoff = 1 if moltype=='dayhoff' else 0 + is_hp = 1 if moltype=='hp' else 0 + + ksize = row['ksize'] + scaled = row['scaled'] + seed = row['seed'] + + mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, + is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) + + c.execute("SELECT hashval FROM sourmash_hashes WHERE sketch_id=?", + (sketch_id,)) + + for hashval, in c: + mh.add_hash(convert_hash_from(hashval)) + + ss = SourmashSignature(mh, name=row['name'], + filename=row['filename']) + yield ss, self.dbfile, sketch_id + + def _get_matching_sketches(self, c, hashes, max_hash): + """ + For hashvals in 'hashes', retrieve all matching sketches, + together with the number of overlapping hashes for each sketch. + + CTB: we do not use sqlite manifest conditions on this select, + because it slows things down in practice. + """ + c.execute("DROP TABLE IF EXISTS sourmash_hash_query") + c.execute("CREATE TEMPORARY TABLE sourmash_hash_query (hashval INTEGER PRIMARY KEY)") + + hashvals = [ (convert_hash_to(h),) for h in hashes ] + c.executemany("INSERT OR IGNORE INTO sourmash_hash_query (hashval) VALUES (?)", + hashvals) + + # + # set up SELECT conditions + # + + conditions = [] + template_values = [] + + # downsample? => add to conditions + max_hash = min(max_hash, max(hashes)) + if max_hash <= MAX_SQLITE_INT: + select_str = "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ?" 
+ conditions.append(select_str) + template_values.append(max_hash) + + # format conditions + conditions.append('sourmash_hashes.hashval=sourmash_hash_query.hashval') + conditions = " AND ".join(conditions) + + c.execute(f""" + SELECT DISTINCT sourmash_hashes.sketch_id,COUNT(sourmash_hashes.hashval) as CNT + FROM sourmash_hashes, sourmash_hash_query + WHERE {conditions} + GROUP BY sourmash_hashes.sketch_id ORDER BY CNT DESC + """, template_values) + + return c + + +class SqliteCollectionManifest(BaseCollectionManifest): + """ + A SQLite-based manifest, used both for SqliteIndex and as a standalone + manifest class. + + This class serves two purposes: + * first, it is a fast, on-disk manifest that can be used in place of + CollectionManifest. + * second, it can be included within a SqliteIndex (which stores hashes + too). In this case, however, new entries must be inserted by SqliteIndex + rather than directly in this class. + + In the latter case, the SqliteCollectionManifest is created with + managed_by_index set to True. + """ + def __init__(self, conn, *, selection_dict=None, managed_by_index=False): + """ + Here, 'conn' should already be connected and configured. + + Use 'create(filename)' to create a new database. + + Use 'create_from_manifest(filename, manifest) to create a new db + from an existing manifest object. + + Use 'load_from_filename' to load from file. + """ + assert conn is not None + self.conn = conn + self.selection_dict = selection_dict + self.managed_by_index = managed_by_index + self._num_rows = None + + @classmethod + def create(cls, filename): + "Connect to 'filename' and create the tables as a standalone manifest." + conn = sqlite3.connect(filename) + cursor = conn.cursor() + cls._create_tables(cursor) + return cls(conn) + + @classmethod + def load_from_manifest(cls, manifest, *, dbfile=":memory:", append=False): + "Create a new sqlite manifest from an existing manifest object." 
+ return cls._create_manifest_from_rows(manifest.rows, location=dbfile, + append=append) + + @classmethod + def create_manifest(cls, locations_iter, *, include_signature=False): + """Create a manifest from an iterator that yields (ss, location) + + Stores signatures in manifest rows by default. + + Note: do NOT catch exceptions here, so this passes through load excs. + Note: this method ignores 'include_signature'. + """ + def rows_iter(): + for ss, location in locations_iter: + row = cls.make_manifest_row(ss, location, + include_signature=False) + yield row + + return cls._create_manifest_from_rows(rows_iter()) + + @classmethod + def _create_tables(cls, cursor): + "Create the manifest table." + # this is a class method so that it can be used by SqliteIndex to + # create manifest-compatible tables. + + sqlite_utils.add_sourmash_internal(cursor, 'SqliteManifest', '1.0') + cursor.execute(""" + CREATE TABLE sourmash_sketches + (id INTEGER PRIMARY KEY, + name TEXT, + num INTEGER NOT NULL, + scaled INTEGER NOT NULL, + ksize INTEGER NOT NULL, + filename TEXT, + moltype TEXT NOT NULL, + with_abundance BOOLEAN NOT NULL, + md5sum TEXT NOT NULL, + seed INTEGER NOT NULL, + n_hashes INTEGER NOT NULL, + internal_location TEXT, + UNIQUE(internal_location, md5sum) + ) + """) + + def _insert_row(self, cursor, row, *, call_is_from_index=False): + "Insert a new manifest row." + # check - is this manifest managed by SqliteIndex? If so, prevent + # insertions unless SqliteIndex is the one calling it. 
+ if self.managed_by_index and not call_is_from_index: + raise Exception("must use SqliteIndex.insert to add to this manifest") + + row = dict(row) + if 'seed' not in row: + row['seed'] = 42 + + cursor.execute(""" + INSERT OR IGNORE INTO sourmash_sketches + (name, num, scaled, ksize, filename, md5sum, moltype, + seed, n_hashes, with_abundance, internal_location) + VALUES (:name, :num, :scaled, :ksize, :filename, :md5, + :moltype, :seed, :n_hashes, :with_abundance, + :internal_location)""", row) + + self._num_rows = None # reset cache + + def __bool__(self): + "Is this manifest empty?" + if self._num_rows is not None: + return bool(self._num_rows) + + try: + next(iter(self.rows)) + return True + except StopIteration: + return False + + def __eq__(self, other): + "Check equality on a row-by-row basis. May fail on out-of-order rows." + for (a, b) in itertools.zip_longest(self.rows, other.rows): + # ignore non-required keys. + for k in self.required_keys: + if a[k] != b[k]: + return False + + return True + + def __len__(self): + "Number of rows." + + # can we use cached value? + if self._num_rows is not None: + return self._num_rows + + # self.rows is a generator, so can't use 'len' + self._num_rows = sum(1 for _ in self.rows) + return self._num_rows + + def _make_select(self): + """Build a set of SQL SELECT conditions and matching value tuple + that can be used to select the right sketches from the + database. + + Returns a triple 'conditions', 'values', and 'picklist'. + 'conditions' is a list that should be joined with 'AND'. + + The picklist is simply retrieved from the selection dictionary. 
+ """ + conditions = [] + values = [] + picklist = None + if self.selection_dict: + select_d = self.selection_dict + if 'ksize' in select_d and select_d['ksize']: + conditions.append("sourmash_sketches.ksize = ?") + values.append(select_d['ksize']) + if 'num' in select_d and select_d['num'] > 0: + conditions.append("sourmash_sketches.num > 0") + if 'scaled' in select_d and select_d['scaled'] > 0: + conditions.append("sourmash_sketches.scaled > 0") + if 'containment' in select_d and select_d['containment']: + conditions.append("sourmash_sketches.scaled > 0") + if 'moltype' in select_d and select_d['moltype'] is not None: + moltype = select_d['moltype'] + assert moltype in ('DNA', 'protein', 'dayhoff', 'hp'), moltype + conditions.append(f"sourmash_sketches.moltype = '{moltype}'") + + picklist = select_d.get('picklist') + + return conditions, values, picklist + + def select_to_manifest(self, **kwargs): + "Create a new SqliteCollectionManifest with the given select args." + # Pass along all the selection kwargs to a new instance + if self.selection_dict: + debug_literal("sqlite manifest: merging selection dicts") + # combine selects... + d = dict(self.selection_dict) + for k, v in kwargs.items(): + if k in d: + if d[k] is not None and d[k] != v: + raise ValueError(f"incompatible select on '{k}'") + d[k] = v + kwargs = d + + new_mf = SqliteCollectionManifest(self.conn, selection_dict=kwargs) + + # if picklist, make sure we fill in 'found'. + picklist = kwargs.get('picklist') + if picklist is not None: + debug_literal("sqlite manifest: iterating through picklist") + _ = len(self) # this forces iteration through rows. + + return new_mf + + @property + def rows(self): + "Return rows that match the selection." 
+ c1 = self.conn.cursor() + + conditions, values, picklist = self._make_select() + if conditions: + conditions = conditions = "WHERE " + " AND ".join(conditions) + else: + conditions = "" + + debug_literal(f"sqlite manifest rows: executing select with '{conditions}'") + c1.execute(f""" + SELECT id, name, md5sum, num, scaled, ksize, filename, moltype, + seed, n_hashes, internal_location FROM sourmash_sketches {conditions} + """, values) + + debug_literal("sqlite manifest: entering row yield loop") + for (_id, name, md5sum, num, scaled, ksize, filename, moltype, + seed, n_hashes, iloc) in c1: + row = dict(num=num, scaled=scaled, name=name, filename=filename, + n_hashes=n_hashes, with_abundance=0, ksize=ksize, + md5=md5sum, internal_location=iloc, + moltype=moltype, md5short=md5sum[:8], + seed=seed, _id=_id) + if picklist is None or picklist.matches_manifest_row(row): + yield row + + def filter_rows(self, row_filter_fn): + """Create a new manifest filtered through row_filter_fn. + + This is done in memory, inserting each row one at a time. + """ + def rows_iter(): + for row in self.rows: + if row_filter_fn(row): + yield row + + return self._create_manifest_from_rows(rows_iter()) + + def filter_on_columns(self, col_filter_fn, col_names): + "Create a new manifest based on column matches." + def row_filter_fn(row): + x = [ row[col] for col in col_names if row[col] is not None ] + return col_filter_fn(x) + return self.filter_rows(row_filter_fn) + + def locations(self): + """Return all possible locations for signatures. + + CTB: this may be a (big) superset of locations, if picklists are used. + See test_sqlite_manifest_locations. + + Use set(row['internal_locations'] for row in self.rows) + if you want an exact set of locations; will be slow for big manifests + tho. 
+ """ + c1 = self.conn.cursor() + + conditions, values, picklist = self._make_select() + if conditions: + conditions = conditions = "WHERE " + " AND ".join(conditions) + else: + conditions = "" + + c1.execute(f""" + SELECT DISTINCT internal_location FROM sourmash_sketches {conditions} + """, values) + + return ( iloc for iloc, in c1 ) + + def __contains__(self, ss): + "Check to see if signature 'ss' is in this manifest." + md5 = ss.md5sum() + + c = self.conn.cursor() + c.execute('SELECT COUNT(*) FROM sourmash_sketches WHERE md5sum=?', + (md5,)) + val, = c.fetchone() + + if bool(val): + picklist = self.picklist + return picklist is None or ss in self.picklist + return False + + @property + def picklist(self): + "Return the picklist, if any." + if self.selection_dict: + return self.selection_dict.get('picklist') + return None + + def to_picklist(self): + "Convert this manifest to a picklist." + pickset = set() + for row in self.rows: + pickset.add(row['md5']) + + picklist = SignaturePicklist('md5') + picklist.pickset = pickset + return picklist + + @classmethod + def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", + append=False): + """Create a SqliteCollectionManifest from a rows iterator. + + Internal utility function. + + CTB: should enable converting in-memory sqlite db to on-disk, + probably with sqlite3 'conn.backup(...)' function. + """ + try: + mf = cls.create(location) + except (sqlite3.OperationalError, sqlite3.DatabaseError) as exc: + if not append: + raise Exception(f"cannot create sqlite3 db at '{location}'; exception: {str(exc)}") + db = load_sqlite_index(location, request_manifest=True) + mf = db.manifest + + cursor = mf.conn.cursor() + + for row in rows_iter: + mf._insert_row(cursor, row) + + mf.conn.commit() + return mf + + +class LCA_SqliteDatabase(SqliteIndex): + """ + A wrapper class for SqliteIndex + lineage db => LCA_Database functionality. 
+ """ + is_database = True + + def __init__(self, dbfile, *, lineage_db=None): + # CTB note: we need to let SqliteIndex open dbfile here, so can't + # just pass in a conn. + super().__init__(dbfile) + + c = self.conn.cursor() + + c.execute('SELECT DISTINCT ksize, moltype FROM sourmash_sketches') + res = list(c) + if len(res) > 1: + raise TypeError("can only have one ksize & moltype in an LCA_SqliteDatabase") + if len(res) == 0: + raise ValueError("cannot load an LCA_SqliteDatabase") + + self.ksize, self.moltype = res[0] + debug_literal(f"setting ksize and moltype to {self.ksize}, {self.moltype}") + + if lineage_db is not None: + self.lineage_db = lineage_db + + ## the below is done once, but could be implemented as something + ## ~dynamic. + self._build_index() + + @classmethod + def load(cls, filename): + "Load LCA_SqliteDatabase from a single file." + from sourmash.tax.tax_utils import LineageDB_Sqlite + + # first, load the SqliteIndex: + try: + debug_literal("sqlite_index: loading LCA_SqliteDatabase as SqliteIndex.") + obj = cls(filename) + except sqlite3.OperationalError: + raise ValueError(f"cannot open '{filename}' as a SQLite index.") + + # now, toss in the lineage DB. + lineage_db = LineageDB_Sqlite(obj.conn) + obj.lineage_db = lineage_db + obj._build_index() + + return obj + + @classmethod + def create(cls, filename, idx, lineage_db): + "Create a LCA_SqliteDatabase in a single file from existing idx/ldb." + from sourmash.tax.tax_utils import MultiLineageDB + + # first, save/create signatures... + sqlidx = SqliteIndex.create(filename) + + for ss in idx.signatures(): + sqlidx.insert(ss) + + # now, save the lineage_db into the same database + out_lineage_db = MultiLineageDB() + out_lineage_db.add(lineage_db) + out_lineage_db._save_sqlite(None, conn=sqlidx.conn) + + # and voila! return, I guess? + return cls.load(filename) + + def _build_index(self): + "Rebuild the mappings that support identifier <-> lineage." 
+ mf = self.manifest + lineage_db = self.lineage_db + + ident_to_idx = {} + next_lid = 0 + idx_to_lid = {} + lineage_to_lid = {} + lid_to_lineage = {} + + for row in mf.rows: + name = row['name'] + if name: + # this is a bit of a hack. we try identifiers _with_ and + # _without_ versions, and take whichever works. There is + # definitely a better way to do this, but I can't think + # of one right now. + ident = name.split(' ')[0] + + lineage = lineage_db.get(ident) # try with identifier version + if lineage is None: # nope - remove version.x + ident = name.split('.')[0] + lineage = lineage_db.get(ident) + + idx = row['_id'] # this is only present in sqlite manifests. + ident_to_idx[ident] = idx + + if lineage: + lid = lineage_to_lid.get(lineage) + + # manufacture new lid? + if lid is None: + lid = next_lid + next_lid += 1 + + lineage_to_lid[lineage] = lid + lid_to_lineage[lid] = lineage + + # assign idx <-> lid + idx_to_lid[idx] = lid + + self.ident_to_idx = ident_to_idx + self.idx_to_lid = idx_to_lid + self.lid_to_lineage = lid_to_lineage + + # prevent insertions + def insert(self, *args, **kwargs): + raise NotImplementedError + + ### LCA_Database API/protocol. + + def downsample_scaled(self, scaled): + "Downsample the scaled for querying." + if scaled < self.scaled: + raise ValueError("cannot decrease scaled from {} to {}".format(self.scaled, scaled)) + + # CTB: maybe return a new LCA_Database? Right now this isn't how + # the lca_db protocol works tho. + self.scaled = scaled + + def get_lineage_assignments(self, hashval, *, min_num=None): + """ + Get a list of lineages for this hashval. + """ + x = [] + + idx_list = self.hashval_to_idx.get(hashval, []) + if min_num is None or len(idx_list) >= min_num: + for idx in idx_list: + lid = self.idx_to_lid.get(idx, None) + if lid is not None: + lineage = self.lid_to_lineage[lid] + x.append(lineage) + + return x + + @cached_property + def idx_to_ident(self): + "Map individual idx to ident." 
+ d = defaultdict(set) + for ident, idx in self.ident_to_idx.items(): + assert idx not in d + d[idx] = ident + return d + + @property + def hashval_to_idx(self): + "Dynamically interpret the SQL 'hashes' table like it's a dict." + return _SqliteIndexHashvalToIndex(self) + + @property + def hashvals(self): + "Return all hashvals" + return iter(_SqliteIndexHashvalToIndex(self)) + + def get_identifiers_for_hashval(self, hashval): + "Return identifiers associated with this hashval" + idxlist = self.hashval_to_idx[hashval] + for idx in idxlist: + yield self.idx_to_ident[idx] + + +class _SqliteIndexHashvalToIndex: + """ + Internal wrapper class to retrieve keys and key/value pairs for + hashval -> [ list of idx ]. + """ + def __init__(self, sqlidx): + self.sqlidx = sqlidx + + def __iter__(self): + "Get all hashvals." + c = self.sqlidx.conn.cursor() + c.execute('SELECT DISTINCT hashval FROM sourmash_hashes') + for hashval, in c: + yield hashval + + def get(self, key, dv=None): + "Retrieve idxlist for a given hash." + sqlidx = self.sqlidx + c = sqlidx.cursor() + + hh = convert_hash_to(key) + + c.execute('SELECT sketch_id FROM sourmash_hashes WHERE hashval=?', + (hh,)) + + x = [ convert_hash_from(h) for h, in c ] + return x or dv + + def __getitem__(self, key): + "Retrieve idxlist for a given hash; raise KeyError if not present." 
+ v = self.get(key) + if v is None: + raise KeyError(key) + return v diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 5393bfa316..ec4cf36362 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -4,6 +4,7 @@ """ import sys import csv +import os from collections import defaultdict from sourmash import sourmash_args @@ -155,6 +156,22 @@ def index(args): moltype = sourmash_args.calculate_moltype(args, default='DNA') picklist = sourmash_args.load_picklist(args) + db_outfile = args.lca_db_out + if args.database_format == 'json': + if not (db_outfile.endswith('.lca.json') or \ + db_outfile.endswith('.lca.json.gz')): # logic -> db.save + db_outfile += '.lca.json' + else: + assert args.database_format == 'sql' + if not db_outfile.endswith('.lca.sql'): + db_outfile += '.lca.sql' + + if os.path.exists(db_outfile): + error(f"ERROR: output file {db_outfile} already exists. Not overwriting.") + sys.exit(-1) + + notify(f'saving to LCA DB: {format(db_outfile)}') + notify(f'Building LCA database with ksize={args.ksize} scaled={args.scaled} moltype={moltype}.') # first, load taxonomy spreadsheet @@ -295,13 +312,7 @@ def index(args): unused_identifiers = set(assignments) - record_used_idents # now, save! - db_outfile = args.lca_db_out - if not (db_outfile.endswith('.lca.json') or \ - db_outfile.endswith('.lca.json.gz')): # logic -> db.save - db_outfile += '.lca.json' - notify(f'saving to LCA DB: {format(db_outfile)}') - - db.save(db_outfile) + db.save(db_outfile, format=args.database_format) ## done! 
diff --git a/src/sourmash/lca/command_rankinfo.py b/src/sourmash/lca/command_rankinfo.py index ec8aba4a16..8cd4c95a71 100644 --- a/src/sourmash/lca/command_rankinfo.py +++ b/src/sourmash/lca/command_rankinfo.py @@ -20,7 +20,7 @@ def make_lca_counts(dblist, min_num=0): assignments = defaultdict(set) for lca_db in dblist: for hashval in lca_db.hashvals: - lineages = lca_db.get_lineage_assignments(hashval, min_num) + lineages = lca_db.get_lineage_assignments(hashval, min_num=min_num) if lineages: assignments[hashval].update(lineages) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index cda1208a60..280f810426 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -80,17 +80,32 @@ def __init__(self, ksize, scaled, moltype='DNA'): @property def location(self): + """Return source filename. + + Part of the Index protocol. + """ return self.filename def __len__(self): + """Return number of sketches. + + Part of the Index protocol. + """ return self._next_index def _invalidate_cache(self): + """Force rebuild of signatures after an 'insert'. + + Internal method. + """ if hasattr(self, '_cache'): del self._cache def _get_ident_index(self, ident, fail_on_duplicate=False): - "Get (create if nec) a unique int id, idx, for each identifier." + """Get (create if necessary) a unique int idx, for each identifier. + + Internal method. + """ idx = self._ident_to_idx.get(ident) if fail_on_duplicate: assert idx is None # should be no duplicate identities @@ -104,7 +119,11 @@ def _get_ident_index(self, ident, fail_on_duplicate=False): return idx def _get_lineage_id(self, lineage): - "Get (create if nec) a unique lineage ID for each LineagePair tuples." + """Get (create if necessary) a unique lineage ID for each + LineagePair tuples." + + Internal method of this class. + """ # does one exist already? 
lid = self._lineage_to_lid.get(lineage) @@ -128,6 +147,8 @@ def insert(self, sig, ident=None, lineage=None): if not specified, the signature name (sig.name) is used. 'lineage', if specified, must contain a tuple of LineagePair objects. + + Method unique to this class. """ minhash = sig.minhash @@ -179,19 +200,33 @@ def __repr__(self): return "LCA_Database('{}')".format(self.filename) def signatures(self): - "Return all of the signatures in this LCA database." + """Return all of the signatures in this LCA database. + + Part of the Index protocol. + """ from sourmash import SourmashSignature - for v in self._signatures.values(): - yield v + + if self.picklists: + pl = self.picklists + for v in self._signatures.values(): + if passes_all_picklists(v, pl): + yield v + else: + for v in self._signatures.values(): + yield v def _signatures_with_internal(self): - "Return all of the signatures in this LCA database." + """Return all of the signatures in this LCA database. + + Part of the Index protocol; used for buulding manifests. + """ + for idx, ss in self._signatures.items(): yield ss, idx def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, containment=False, picklist=None): - """Make sure this database matches the requested requirements. + """Select a subset of signatures to search. As with SBTs, queries with higher scaled values than the database can still be used for containment search, but not for similarity @@ -223,12 +258,21 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, @classmethod def load(cls, db_name): - "Load LCA_Database from a JSON file." + """Load LCA_Database from a JSON file. + + Method specific to this class. 
+ """ from .lca_utils import taxlist, LineagePair if not os.path.isfile(db_name): raise ValueError(f"'{db_name}' is not a file and cannot be loaded as an LCA database") + try: + from sourmash.index.sqlite_index import LCA_SqliteDatabase + return LCA_SqliteDatabase.load(db_name) + except ValueError: + pass + xopen = open if db_name.endswith('.gz'): xopen = gzip.open @@ -322,8 +366,21 @@ def load(cls, db_name): return db - def save(self, db_name): - "Save LCA_Database to a JSON file." + def save(self, db_name, *, format='json'): + if format == 'sql': + self.save_to_sql(db_name) + else: + assert format == 'json' + self.save_to_json(db_name) + + def save_to_json(self, db_name): + """Save LCA_Database to a JSON file. + + Method specific to this class. + """ + if os.path.exists(db_name): + raise ValueError(f"LCA database {db_name} already exists; not overwriting or appending") + xopen = open if db_name.endswith('.gz'): xopen = gzip.open @@ -360,12 +417,38 @@ def save(self, db_name): json.dump(save_d, fp) + def save_to_sql(self, dbname): + "Save this LCA_Database into an LCA_SqliteDatabase" + from sourmash.index.sqlite_index import LCA_SqliteDatabase + from sourmash.tax.tax_utils import LineageDB + + if os.path.exists(dbname): + raise ValueError(f"LCA database {dbname} already exists; not overwriting or appending") + + # create a new in-memory lineage db... + assignments = {} + available_ranks = set() # track ranks, too + for ident, idx in self._ident_to_idx.items(): + lid = self._idx_to_lid.get(idx) + if lid is not None: + lineage = self._lid_to_lineage[lid] + assignments[ident] = lineage + for pair in lineage: + available_ranks.add(pair.rank) + + ldb = LineageDB(assignments, available_ranks) + + # ...and pass over to create, using 'self' as index. + LCA_SqliteDatabase.create(dbname, self, ldb) + def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes that don't fall in the required range. 
This applies to this database in place. + + Method specific to LCA databases. """ if scaled == self.scaled: return @@ -390,8 +473,9 @@ def hashvals(self): return self._hashval_to_idx.keys() def get_lineage_assignments(self, hashval, min_num=None): - """ - Get a list of lineages for this hashval. + """Get a list of lineages for this hashval. + + Method specific to LCA Databases. """ x = [] @@ -419,7 +503,10 @@ def get_identifiers_for_hashval(self, hashval): @cached_property def _signatures(self): - "Create a _signatures member dictionary that contains {idx: sigobj}." + """Create a _signatures member dictionary that contains {idx: sigobj}. + + Internal method of this class. + """ from sourmash import MinHash, SourmashSignature is_protein = False @@ -482,6 +569,8 @@ def find(self, search_fn, query, **kwargs): As with SBTs, queries with higher scaled values than the database can still be used for containment search, but not for similarity search. See SBT.select(...) for details. + + Part of the Index protocol. """ search_fn.check_is_compatible(query) @@ -542,6 +631,10 @@ def find(self, search_fn, query, **kwargs): @cached_property def _lid_to_idx(self): + """Connect lineage id lid (int) to idx set (set of ints)."" + + Method specific to LCA databases. + """ d = defaultdict(set) for idx, lid in self._idx_to_lid.items(): d[lid].add(idx) @@ -549,6 +642,10 @@ def _lid_to_idx(self): @cached_property def _idx_to_ident(self): + """Connect idx (int) to ident (str). + + Method specific to LCA databases. 
+ """ d = defaultdict(set) for ident, idx in self._ident_to_idx.items(): assert idx not in d diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 2447d94067..44a1163dae 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -3,7 +3,9 @@ """ import csv import ast +import os.path from abc import abstractmethod +import itertools from sourmash.picklist import SignaturePicklist @@ -26,8 +28,19 @@ class BaseCollectionManifest: 'scaled', 'n_hashes', 'with_abundance', 'name', 'filename') + @classmethod + @abstractmethod + def load_from_manifest(cls, manifest, **kwargs): + "Load this manifest from another manifest object." + @classmethod def load_from_filename(cls, filename): + # SQLite db? + db = cls.load_from_sql(filename) + if db is not None: + return db + + # not a SQLite db? with open(filename, newline="") as fp: return cls.load_from_csv(fp) @@ -67,9 +80,26 @@ def load_from_csv(cls, fp): return cls(manifest_list) - def write_to_filename(self, filename): - with open(filename, "w", newline="") as fp: - return self.write_to_csv(fp, write_header=True) + @classmethod + def load_from_sql(cls, filename): + from sourmash.index.sqlite_index import load_sqlite_index + db = load_sqlite_index(filename, request_manifest=True) + if db: + return db.manifest + + def write_to_filename(self, filename, *, database_format='csv', + ok_if_exists=False): + if database_format == 'csv': + if ok_if_exists or not os.path.exists(filename): + with open(filename, "w", newline="") as fp: + return self.write_to_csv(fp, write_header=True) + elif os.path.exists(filename) and not ok_if_exists: + raise Exception("output manifest already exists") + + elif database_format == 'sql': + from sourmash.index.sqlite_index import SqliteCollectionManifest + SqliteCollectionManifest.load_from_manifest(self, dbfile=filename, + append=ok_if_exists) @classmethod def write_csv_header(cls, fp): @@ -80,7 +110,8 @@ def write_csv_header(cls, fp): def write_to_csv(self, fp, 
write_header=False): "write manifest CSV to specified file handle" - w = csv.DictWriter(fp, fieldnames=self.required_keys) + w = csv.DictWriter(fp, fieldnames=self.required_keys, + extrasaction='ignore') if write_header: self.write_csv_header(fp) @@ -183,6 +214,11 @@ def __init__(self, rows): self._add_rows(rows) + @classmethod + def load_from_manifest(cls, manifest, **kwargs): + "Load this manifest from another manifest object." + return cls(manifest.rows) + def _add_rows(self, rows): self.rows.extend(rows) @@ -207,7 +243,17 @@ def __len__(self): return len(self.rows) def __eq__(self, other): - return self.rows == other.rows + "Check equality on a row-by-row basis. May fail on out-of-order rows." + for (a, b) in itertools.zip_longest(self.rows, other.rows): + if a is None or b is None: + return False + + # ignore non-required keys. + for k in self.required_keys: + if a[k] != b[k]: + return False + + return True def _select(self, *, ksize=None, moltype=None, scaled=0, num=0, containment=False, abund=None, picklist=None): diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index fa46a7d209..3dd495b6d3 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -13,7 +13,8 @@ import sourmash from sourmash.sourmash_args import FileOutput -from sourmash.logging import set_quiet, error, notify, print_results, debug +from sourmash.logging import (set_quiet, error, notify, print_results, debug, + debug_literal) from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled from sourmash.manifest import CollectionManifest @@ -301,11 +302,11 @@ def manifest(args): manifest = sourmash_args.get_manifest(loader, require=True, rebuild=rebuild) - with open(args.output, "w", newline='') as csv_fp: - manifest.write_to_csv(csv_fp, write_header=True) - + manifest.write_to_filename(args.output, + database_format=args.manifest_format, + ok_if_exists=args.force) notify(f"manifest contains {len(manifest)} signatures 
total.") - notify(f"wrote manifest to '{args.output}'") + notify(f"wrote manifest to '{args.output}' ({args.manifest_format})") def overlap(args): @@ -1318,7 +1319,7 @@ def check(args): else: debug("sig check: manifest required") - total_manifest_rows = [] + total_manifest_rows = CollectionManifest([]) # start loading! total_rows_examined = 0 @@ -1336,9 +1337,10 @@ def check(args): # has manifest, or ok to build (require_manifest=False) - continue! manifest = sourmash_args.get_manifest(idx, require=True) - manifest_rows = manifest._select(picklist=picklist) + manifest_rows = manifest.select_to_manifest(picklist=picklist) total_rows_examined += len(manifest) total_manifest_rows += manifest_rows + debug_literal(f"examined {len(manifest)} new rows, found {len(manifest_rows)} matching rows") notify(f"loaded {total_rows_examined} signatures.") @@ -1370,9 +1372,9 @@ def check(args): # save manifest of matching! if args.save_manifest_matching and total_manifest_rows: - mf = CollectionManifest(total_manifest_rows) - with open(args.save_manifest_matching, 'w', newline="") as fp: - mf.write_to_csv(fp, write_header=True) + mf = total_manifest_rows + mf.write_to_filename(args.save_manifest_matching, + database_format=args.manifest_format) notify(f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'") elif args.save_manifest_matching: notify(f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)") diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 60cba7f43f..a004a51fc4 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -53,6 +53,7 @@ from .logging import notify, error, debug_literal from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) +from .index.sqlite_index import load_sqlite_index, SqliteIndex from . 
import signature as sigmod from .picklist import SignaturePicklist, PickStyle from .manifest import CollectionManifest @@ -403,6 +404,10 @@ def _load_revindex(filename, **kwargs): return db +def _load_sqlite_db(filename, **kwargs): + return load_sqlite_index(filename) + + def _load_zipfile(filename, **kwargs): "Load collection from a .zip file." db = None @@ -422,6 +427,7 @@ def _load_zipfile(filename, **kwargs): # all loader functions, in order. _loader_functions = [ ("load from stdin", _load_stdin), + ("load collection from sqlitedb", _load_sqlite_db), ("load from standalone manifest", _load_standalone_manifest), ("load from path (file or directory)", _multiindex_load_from_path), ("load from file list", _multiindex_load_from_pathlist), @@ -765,7 +771,6 @@ def get_manifest(idx, *, require=True, rebuild=False): """ Retrieve a manifest for this idx, loaded with `load_file_as_index`. - If a manifest exists and `rebuild` is False, return the manifest. Even if a manifest exists and `rebuild` is True, rebuild the manifest. If a manifest does not exist or `rebuild` is True, try to build one. If a manifest cannot be built and `require` is True, error exit. @@ -786,7 +791,7 @@ def get_manifest(idx, *, require=True, rebuild=False): # need to build one... try: - debug_literal("get_manifest: rebuilding manifest") + notify("Generating a manifest...") m = CollectionManifest.create_manifest(idx._signatures_with_internal(), include_signature=False) debug_literal("get_manifest: rebuilt manifest.") @@ -899,6 +904,36 @@ def add(self, ss): sigmod.save_signatures([ss], fp, compression=1) +class SaveSignatures_SqliteIndex(_BaseSaveSignaturesToLocation): + "Save signatures within a directory, using md5sum names." 
+ def __init__(self, location): + super().__init__(location) + self.location = location + self.idx = None + self.cursor = None + + def __repr__(self): + return f"SaveSignatures_SqliteIndex('{self.location}')" + + def close(self): + self.idx.commit() + self.cursor.execute('VACUUM') + self.idx.close() + + def open(self): + self.idx = SqliteIndex.create(self.location, append=True) + self.cursor = self.idx.cursor() + + def add(self, add_sig): + for ss in _get_signatures_from_rust([add_sig]): + super().add(ss) + self.idx.insert(ss, cursor=self.cursor, commit=False) + + # commit every 1000 signatures. + if self.count % 1000 == 0: + self.idx.commit() + + class SaveSignatures_SigFile(_BaseSaveSignaturesToLocation): "Save signatures to a .sig JSON file." def __init__(self, location): @@ -1014,18 +1049,20 @@ def add(self, add_sig): class SigFileSaveType(Enum): + NO_OUTPUT = 0 SIGFILE = 1 SIGFILE_GZ = 2 DIRECTORY = 3 ZIPFILE = 4 - NO_OUTPUT = 5 + SQLITEDB = 5 _save_classes = { + SigFileSaveType.NO_OUTPUT: SaveSignatures_NoOutput, SigFileSaveType.SIGFILE: SaveSignatures_SigFile, SigFileSaveType.SIGFILE_GZ: SaveSignatures_SigFile, SigFileSaveType.DIRECTORY: SaveSignatures_Directory, SigFileSaveType.ZIPFILE: SaveSignatures_ZipFile, - SigFileSaveType.NO_OUTPUT: SaveSignatures_NoOutput + SigFileSaveType.SQLITEDB: SaveSignatures_SqliteIndex, } @@ -1042,6 +1079,8 @@ def SaveSignaturesToLocation(filename, *, force_type=None): save_type = SigFileSaveType.SIGFILE_GZ elif filename.endswith('.zip'): save_type = SigFileSaveType.ZIPFILE + elif filename.endswith('.sqldb'): + save_type = SigFileSaveType.SQLITEDB else: # default to SIGFILE intentionally! save_type = SigFileSaveType.SIGFILE diff --git a/src/sourmash/sqlite_utils.py b/src/sourmash/sqlite_utils.py new file mode 100644 index 0000000000..2b7503a2d8 --- /dev/null +++ b/src/sourmash/sqlite_utils.py @@ -0,0 +1,79 @@ +""" +Common utility functions for handling sqlite3 databases. 
+""" +import os +import sqlite3 +from .logging import debug_literal + + +def open_sqlite_db(filename): + """ + Is this a pre-existing sqlite3 database? Return connection object if so. + + Otherwise, return None. + """ + debug_literal("open_sqlite_db: started") + # does it already exist/is it non-zero size? + + # note: sqlite3.connect creates the file if it doesn't exist, which + # we don't want in this function. + if not os.path.exists(filename) or os.path.getsize(filename) == 0: + debug_literal("open_sqlite_db: no file/zero sized file") + return None + + # can we connect to it? + try: + conn = sqlite3.connect(filename) + except (sqlite3.OperationalError, sqlite3.DatabaseError): + debug_literal("open_sqlite_db: cannot connect.") + return None + + # check for the 'sourmash_internal' table. + cursor = conn.cursor() + try: + cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + except (sqlite3.OperationalError, sqlite3.DatabaseError): + debug_literal("open_sqlite_db: cannot read sourmash_internal.") + + # is this a taxonomy DB? + try: + cursor.execute('SELECT * FROM taxonomy LIMIT 1') + except (sqlite3.OperationalError, sqlite3.DatabaseError): + debug_literal("open_sqlite_db: cannot read 'taxonomy', either.") + return None + + return conn + + +def add_sourmash_internal(cursor, use_type, version): + """ + Add use_type/version to sourmash_internal table. + """ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS sourmash_internal ( + key TEXT UNIQUE, + value TEXT + ) + """) + + d = get_sourmash_internal(cursor) + + val = d.get(use_type) + if val is not None: + # do version compatibility foo here? + if version != val: + raise Exception(f"sqlite problem: for {use_type}, want version {version}, got version {val}") + else: + cursor.execute(""" + INSERT INTO sourmash_internal (key, value) VALUES (?, ?) + """, (use_type, version)) + + +def get_sourmash_internal(cursor): + """ + Retrieve a key/value dictionary from sourmash_internal. 
+ """ + cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + d = dict(cursor) + + return d diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 4d9bac4965..d5a7161afc 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -6,6 +6,12 @@ from collections import namedtuple, defaultdict from collections import abc +from sourmash import sqlite_utils +from sourmash.exceptions import IndexNotSupported + +import sqlite3 + + __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs', 'find_match_lineage', 'summarize_gather_at', @@ -228,7 +234,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # summarize at rank! lineage = pop_to_rank(lineage, rank) - assert lineage[-1].rank == rank, lineage[-1] + assert lineage[-1].rank == rank, (rank, lineage[-1]) # record info sum_uniq_to_query[query_name][lineage] += f_unique_to_query sum_uniq_weighted[query_name][lineage] += f_uniq_weighted @@ -622,15 +628,27 @@ def load(cls, filename, *, delimiter=',', force=False, class LineageDB_Sqlite(abc.Mapping): """ - A LineageDB based on a sqlite3 database with a 'taxonomy' table. + A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. """ # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', 'genus', 'species', 'strain') + table_name = 'sourmash_taxonomy' - def __init__(self, conn): + def __init__(self, conn, *, table_name=None): self.conn = conn + # provide for legacy support for pre-sourmash_internal days... + if table_name is not None: + self.table_name = table_name + + # check that the right table is there. 
+ c = conn.cursor() + try: + c.execute(f'SELECT * FROM {self.table_name} LIMIT 1') + except (sqlite3.DatabaseError, sqlite3.OperationalError): + raise ValueError("not a taxonomy database") + # check: can we do a 'select' on the right table? self.__len__() c = conn.cursor() @@ -638,7 +656,7 @@ def __init__(self, conn): # get available ranks... ranks = set() for column, rank in zip(self.columns, taxlist(include_strain=True)): - query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' + query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) cnt, = c.fetchone() if cnt: @@ -649,14 +667,35 @@ def __init__(self, conn): @classmethod def load(cls, location): - "load taxonomy information from a sqlite3 database" - import sqlite3 + "load taxonomy information from an existing sqlite3 database" + conn = sqlite_utils.open_sqlite_db(location) + if not conn: + raise ValueError("not a sqlite taxonomy database") + + table_name = None + c = conn.cursor() try: - conn = sqlite3.connect(location) - db = cls(conn) - except sqlite3.DatabaseError: - raise ValueError("not a sqlite database") - return db + info = sqlite_utils.get_sourmash_internal(c) + except sqlite3.OperationalError: + info = {} + + if 'SqliteLineage' in info: + if info['SqliteLineage'] != '1.0': + raise IndexNotSupported + + table_name = 'sourmash_taxonomy' + else: + # legacy support for old taxonomy DB, pre sourmash_internal. 
+ try: + c.execute('SELECT * FROM taxonomy LIMIT 1') + table_name = 'taxonomy' + except sqlite3.OperationalError: + pass + + if table_name is None: + raise ValueError("not a sqlite taxonomy database") + + return cls(conn, table_name=table_name) def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" @@ -666,7 +705,7 @@ def _make_tup(self, row): def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) + c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,)) # retrieve names list... names = c.fetchone() @@ -687,7 +726,7 @@ def __bool__(self): def __len__(self): "Return number of rows" c = self.conn.cursor() - c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') + c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}') nrows, = c.fetchone() return nrows @@ -695,7 +734,7 @@ def __iter__(self): "Return all identifiers" # create new cursor so as to allow other operations c = self.conn.cursor() - c.execute('SELECT DISTINCT ident FROM taxonomy') + c.execute(f'SELECT DISTINCT ident FROM {self.table_name}') for ident, in c: yield ident @@ -704,11 +743,12 @@ def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy') + c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}') for ident, *names in c: yield ident, self._make_tup(names) + class MultiLineageDB(abc.Mapping): "A wrapper for (dynamically) combining multiple lineage databases." 
@@ -805,15 +845,23 @@ def save(self, filename_or_fp, file_format): if is_filename: fp.close() - def _save_sqlite(self, filename): - import sqlite3 - db = sqlite3.connect(filename) + def _save_sqlite(self, filename, *, conn=None): + from sourmash import sqlite_utils + + if conn is None: + db = sqlite3.connect(filename) + else: + assert not filename + db = conn cursor = db.cursor() try: + sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0') + + # CTB: could add 'IF NOT EXIST' here; would need tests, too. cursor.execute(""" - CREATE TABLE taxonomy ( + CREATE TABLE sourmash_taxonomy ( ident TEXT NOT NULL, superkingdom TEXT, phylum TEXT, @@ -831,7 +879,7 @@ class TEXT, raise ValueError(f"taxonomy table already exists in '{filename}'") # follow up and create index - cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") + cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);") for ident, tax in self.items(): x = [ident, *[ t.name for t in tax ]] @@ -840,7 +888,7 @@ class TEXT, while len(x) < 9: x.append('') - cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) db.commit() diff --git a/tests/conftest.py b/tests/conftest.py index a592a1c114..51cdd81a12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,11 +59,17 @@ def linear_gather(request): def prefetch_gather(request): return request.param + @pytest.fixture(params=[True, False]) def use_manifest(request): return request.param +@pytest.fixture(params=['json', 'sql']) +def lca_db_format(request): + return request.param + + # --- BEGIN - Only run tests using a particular fixture --- # # Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/ 
def pytest_collection_modifyitems(items, config): diff --git a/tests/test-data/lca/TARA_ANW_MAG_00005.sig b/tests/test-data/lca/TARA_ANW_MAG_00005.sig new file mode 100644 index 0000000000..d859cc45f7 --- /dev/null +++ b/tests/test-data/lca/TARA_ANW_MAG_00005.sig @@ -0,0 +1,727 @@ +[ + { + "class": "sourmash_signature", + "email": "", + "filename": "TARA_ANW_MAG_00005.fa.gz", + "hash_function": "0.murmur64", + "name": "TARA_ANW_MAG_00005", + "signatures": [ + { + "ksize": 21, + "max_hash": 1844674407370955, + "md5sum": "4869f8fd77731e394c97308236ef53ba", + "mins": [ + 7261499640022, + 17022688539117, + 24780755763437, + 26094062797448, + 37375887029018, + 45726215050976, + 67947402991923, + 70327100175691, + 73631406345485, + 75687893355486, + 105135425934627, + 105274115963054, + 109871703932054, + 138220508528468, + 156844125182123, + 163500818310134, + 163662417847487, + 176892283304016, + 199135490729275, + 201167067220997, + 206837419773784, + 209330443819944, + 212258628194397, + 215291313053192, + 227179770145331, + 232573574424899, + 236654478919192, + 241257270955000, + 251750070415640, + 254725617997120, + 260375367933856, + 271622583135593, + 277315933136573, + 281544839547313, + 293383323070263, + 299823439863555, + 304524402700845, + 313091075946723, + 322547749912078, + 324954619227929, + 339233563669806, + 354830119066741, + 379491551636935, + 381041760937822, + 385692763394602, + 388245973749043, + 393105275665373, + 404961751537071, + 410646122097066, + 443952051764237, + 446573579198579, + 455751180059053, + 455849023177433, + 464377859899231, + 480151994650880, + 482173200268375, + 494426325671056, + 496485801898964, + 499100180067828, + 500217412914199, + 510317957315879, + 512781920587711, + 519602171835768, + 527445477574409, + 539623224762798, + 551979853417783, + 558937606910908, + 559147097864119, + 563686843032410, + 572262671878160, + 572933399733742, + 582833254610830, + 610743369458011, + 610979072963727, + 627233310987889, + 
634242600060310, + 637737765634689, + 644693698396599, + 645789433201504, + 647303140882444, + 655342839954874, + 659916281112761, + 665799220375952, + 666604147522311, + 674648371775796, + 682469416885882, + 691207119387676, + 692309787227983, + 701046872481506, + 705635514785796, + 712392935591389, + 715825705926668, + 723721368488719, + 735483859106928, + 750019233304834, + 795803454454480, + 797801163316046, + 817539581068493, + 823714523111649, + 828868904212981, + 829149214494779, + 841331208856587, + 843448710967822, + 843570874513558, + 851047127466851, + 880184748043904, + 885106443137060, + 891079117673150, + 893699261563541, + 894265342142509, + 894359432603692, + 899969039734078, + 906277352238206, + 910237395134432, + 926628766471405, + 938555198202282, + 939878538339783, + 941325832724695, + 945067665224496, + 955753620001282, + 963285612497504, + 973840465645911, + 974598447754644, + 976538178114630, + 1000480444477003, + 1000891863764431, + 1006636265411357, + 1015780423231175, + 1027110153373913, + 1027781495529628, + 1029358875843838, + 1035682915448197, + 1040800579640246, + 1041767829353040, + 1046606731921185, + 1047272987104794, + 1051422429911186, + 1053787813708170, + 1057985735911604, + 1069554451838665, + 1069767125970764, + 1069963911327886, + 1082781035208096, + 1085087750974918, + 1096029135539154, + 1097334635570103, + 1110082710835370, + 1110978970201997, + 1124169773515246, + 1135598964276263, + 1139684720640609, + 1153936392816523, + 1157602739081054, + 1159063477570599, + 1181401217374597, + 1182718356752693, + 1193959678292417, + 1201898193124523, + 1205671887893525, + 1210442343213638, + 1216902405821085, + 1223411613615937, + 1230689994892761, + 1249109538741833, + 1251192229963588, + 1270863556996318, + 1271929096987634, + 1272414160447604, + 1277214110748637, + 1277513755537277, + 1284384561550957, + 1286911165539939, + 1288689074612688, + 1296477416153844, + 1298227293695131, + 1298606618964048, + 1306046498097540, + 
1306085630255409, + 1307062292190132, + 1314694397354156, + 1315563949283672, + 1317303969578154, + 1333149524347325, + 1354583571879519, + 1356191628776297, + 1357139820587419, + 1358967596782834, + 1367702836962704, + 1376066629062225, + 1377198811584963, + 1396140086039349, + 1407540073714301, + 1416195510596512, + 1420646987991559, + 1427432464819677, + 1430310371960086, + 1442622830656700, + 1449246931138848, + 1451231201999347, + 1453942482622285, + 1458064475606220, + 1459062709645729, + 1463874510680880, + 1469762310223581, + 1471902842254525, + 1486175849324817, + 1489602502204481, + 1491078763648688, + 1492399376187328, + 1496318049992711, + 1507073974958147, + 1510612848092577, + 1513381488441236, + 1534455809393163, + 1538653330058790, + 1545646030677570, + 1552867471529641, + 1570136057034454, + 1576651925401966, + 1577309859380825, + 1580083124831154, + 1607494478468523, + 1610296318733341, + 1620898530293314, + 1623476921244704, + 1627138231728305, + 1648483189456971, + 1652419439072205, + 1656287422487530, + 1672514725054986, + 1676458762042511, + 1682399253692259, + 1691577763605540, + 1697754811186722, + 1699535359627627, + 1700685618813670, + 1709012328262269, + 1723559970655079, + 1731733075749857, + 1735191746377532, + 1736229881750752, + 1738221912994299, + 1743160386558457, + 1744244795454307, + 1748180345282466, + 1748348470930756, + 1751106855999370, + 1758843559742057, + 1763271561451064, + 1764635978230060, + 1770936882947726, + 1778277096336167, + 1779407779292240, + 1796022957337456, + 1817365164110605, + 1831917296885812, + 1836261826113111, + 1842914377206390 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + }, + { + "ksize": 31, + "max_hash": 1844674407370955, + "md5sum": "32b685f9a5c92ff04d3e5a71e2b819a6", + "mins": [ + 4544564541777, + 8581313229021, + 13858004909890, + 22109472800306, + 33243310918093, + 35046909730857, + 36002943219015, + 36803243944975, + 40925794788684, + 50300561984719, + 54925088761051, + 
65148730313359, + 67973641676064, + 76325417666516, + 92860832822232, + 104860744849022, + 111871575504049, + 125329050929634, + 129188889144749, + 131536884986771, + 132973903820267, + 151766414818154, + 155289997869327, + 168141435154447, + 177548054693036, + 181315672994049, + 183757115294821, + 194222120718333, + 213139491415980, + 218773877233050, + 240860931922136, + 248846621773679, + 260236619960314, + 262780918637537, + 265790317618837, + 272636420861010, + 274046819588001, + 277912619386690, + 286605637178530, + 292973459027025, + 318322734402250, + 321838426241799, + 328850724708193, + 346478169352874, + 348380083210079, + 354246879167003, + 359178493513027, + 360239247174006, + 371785010691833, + 379094350614852, + 381077258936088, + 381442111648337, + 388577667001338, + 392358579933525, + 400440609219021, + 406038583921057, + 408330197378941, + 410163772670652, + 423234050937512, + 425218685176027, + 446090115298591, + 446453155073485, + 455236569828737, + 458856926805669, + 462486830307725, + 462595004203208, + 483166107352903, + 483704898247277, + 494244104678300, + 494639859417056, + 501192188637884, + 510675622259470, + 518201549483575, + 522168448102877, + 547850350038011, + 548618821296654, + 566297202288227, + 571007990707951, + 577781161449752, + 583275969589027, + 594912311991845, + 601236609369372, + 602107849542274, + 608271077234669, + 632062004417280, + 632109760592796, + 634727833351551, + 637278824563158, + 644971206181512, + 648533249063278, + 649287581711486, + 651968974610436, + 655856515200674, + 679849320365200, + 681103172177451, + 687704345982478, + 695282473938552, + 698348460889683, + 705162594752041, + 707306922556810, + 711072917501582, + 714754050038510, + 730705098468235, + 744079072205921, + 745952201173799, + 747016626005998, + 757542835394800, + 766146707775082, + 773178920995688, + 782994228389590, + 803727313826863, + 812750392739441, + 821574537203519, + 845836091437649, + 853742331786980, + 864887941091323, + 
869671685914354, + 888435024190881, + 947456280478229, + 949367246267583, + 956491542870449, + 967048970295402, + 971202060766007, + 978931950435734, + 993056624873370, + 996975645223807, + 1007707215648390, + 1008149686779084, + 1018985563127910, + 1022692054265472, + 1027179518608476, + 1036606260474886, + 1043850921081072, + 1049874321329540, + 1075834968301467, + 1087755718865650, + 1088290622982785, + 1088308856754065, + 1097255462428957, + 1102494175504539, + 1106200855941023, + 1163422042717229, + 1199776215756672, + 1203944597053244, + 1211551163495496, + 1216224755819991, + 1216971004976208, + 1246866724202403, + 1262726525591376, + 1271821592733966, + 1295576174349907, + 1298848536168936, + 1310656662269501, + 1326048486469817, + 1327897694537274, + 1339040494856037, + 1343187481578618, + 1349479868328803, + 1351034322121175, + 1360641268612126, + 1383827641790386, + 1383888566098538, + 1399068747946805, + 1405006380107785, + 1414729661325784, + 1414734978678493, + 1420322178649275, + 1420740489306321, + 1439813556506380, + 1446126062725480, + 1463513307502000, + 1463964185955659, + 1466749058711287, + 1472274362269533, + 1472753635123188, + 1473369856693004, + 1479361489796687, + 1482158842043339, + 1485775606140555, + 1499165253592503, + 1502229061684166, + 1503510407494678, + 1505420843877866, + 1519588803781842, + 1568212279715817, + 1573568323455958, + 1574674975536634, + 1578938126273913, + 1579338155998981, + 1590684992147198, + 1597887774957974, + 1615302073601615, + 1617303401995043, + 1635926261617086, + 1637649705314274, + 1643056357828058, + 1648761532214959, + 1652982371280533, + 1654211925340972, + 1663734887887719, + 1665001711886066, + 1689681978632597, + 1696936316661449, + 1708928874057963, + 1722636935798965, + 1729879039789897, + 1730265273177045, + 1733781125039495, + 1737858152932701, + 1740860866518638, + 1746967568522022, + 1756717989991904, + 1759759973007296, + 1773532575823108, + 1782809577016768, + 1789129435832413, + 
1795223687727048, + 1806483358546057, + 1806587563187986, + 1808755229329691, + 1818981796527103, + 1825012166950925, + 1829615133297992 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + }, + { + "ksize": 51, + "max_hash": 1844674407370955, + "md5sum": "bba2d1d1300be544b90d4aadafae32f1", + "mins": [ + 9945101229624, + 46247985928663, + 54549389196089, + 76819130364906, + 88079115519765, + 90403765835697, + 90728653842211, + 91793881436840, + 94834711116805, + 119847841749950, + 119957755440993, + 138435291626912, + 149234087571659, + 150344534966839, + 154871205908281, + 160246061469139, + 165296793264417, + 176505077931801, + 177475993690446, + 195136241340824, + 198126553779599, + 213232222769451, + 215501276863337, + 215540464896491, + 241330341374756, + 244769608524648, + 252142941757664, + 276550930094750, + 282618206741836, + 301371296153703, + 305309954920570, + 339509083230992, + 350142354365511, + 355594636141267, + 361534239440081, + 374310413633401, + 378467068012005, + 378561877946670, + 378774522785655, + 391959695661797, + 395185026373394, + 413509418028148, + 436335056083665, + 444003188238308, + 452456316931781, + 470812297949332, + 489965934181414, + 490503459046167, + 498421264705664, + 508170103481209, + 510762917685834, + 515220336706193, + 523866171723467, + 537917614206757, + 544478384905872, + 545412682597246, + 558071594433189, + 569149585492933, + 577170224766862, + 617367446516686, + 624596817632791, + 626482053839411, + 627721032112159, + 630680332331388, + 632537462650991, + 635533295427126, + 647757051000115, + 662866697363867, + 675505594365095, + 682437654404743, + 695734598424378, + 707400890074172, + 714316338411466, + 718153160045569, + 725102976612073, + 726575575467779, + 728372447970513, + 740410965234874, + 741656025825068, + 746599689567078, + 748390654459078, + 757635051023441, + 758166256157331, + 789243880255584, + 791148943006182, + 801095000521623, + 832847865995258, + 860399304487865, + 881294351880349, + 
890847340518834, + 894898066586874, + 897917085118712, + 899165193706818, + 903850995491923, + 905164537161159, + 905864901556721, + 946633982106275, + 952990090209151, + 963203334828915, + 966819043771784, + 969971834372201, + 970233153403479, + 973995346671367, + 977477764556730, + 980746776478350, + 1004797079756520, + 1006328830052067, + 1051098362406450, + 1052360301154198, + 1060380075133980, + 1061890704417468, + 1071314112022724, + 1072673432279368, + 1082568720981364, + 1087830041195302, + 1090942563769112, + 1093128988316444, + 1095581023497886, + 1106345633154217, + 1110676749840599, + 1127514406891265, + 1128500159929587, + 1138284377264029, + 1143890328754304, + 1144620351493416, + 1166493475885943, + 1169520306083838, + 1192839686284246, + 1205296513579035, + 1207049757360820, + 1217156179909838, + 1219668734522480, + 1242257289107445, + 1255870300486127, + 1260373197397089, + 1275980315154201, + 1287819253321456, + 1307645721227691, + 1313514333282255, + 1315398559960369, + 1323401307902219, + 1338807497736601, + 1345189582040158, + 1353551640680240, + 1361160979936763, + 1364045380953529, + 1373790677075103, + 1378284377398456, + 1402949164058266, + 1411207819443473, + 1414729246029520, + 1414772044138756, + 1441807154110607, + 1442464388167919, + 1446817139986549, + 1447638619329225, + 1456981908420248, + 1471921223421648, + 1478104292616684, + 1491503227667013, + 1494865009683892, + 1495418386991481, + 1497557532387931, + 1501350490752912, + 1506945353480777, + 1525676266116689, + 1527023747356139, + 1529696683224220, + 1555216430887872, + 1563070810453108, + 1563360269911280, + 1569674263826199, + 1579909907362041, + 1605355045136346, + 1615177077944557, + 1628443217654855, + 1634394111160905, + 1637919048744925, + 1641378210621886, + 1644323079960433, + 1645313782077105, + 1648240779644969, + 1648667343256600, + 1652224427058465, + 1656589272362026, + 1684118171988047, + 1703817671055877, + 1727808733759629, + 1735226051311727, + 
1740853963999019, + 1754872609648351, + 1764021135315600, + 1765055162361848, + 1771357182711174, + 1773394602563094, + 1778045463227508, + 1789909160154131, + 1801705834605266, + 1823838398335508, + 1827392188411080, + 1836473457916331, + 1836603169748121, + 1842874979846210 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + } + ], + "type": "mrnaseq", + "version": 0.4 + } +] \ No newline at end of file diff --git a/tests/test-data/lca/TARA_ASE_MAG_00007.sig b/tests/test-data/lca/TARA_ASE_MAG_00007.sig new file mode 100644 index 0000000000..ce256a3e5b --- /dev/null +++ b/tests/test-data/lca/TARA_ASE_MAG_00007.sig @@ -0,0 +1,547 @@ +[ + { + "class": "sourmash_signature", + "email": "", + "filename": "TARA_ASE_MAG_00007.fa.gz", + "hash_function": "0.murmur64", + "name": "TARA_ASE_MAG_00007", + "signatures": [ + { + "ksize": 21, + "max_hash": 1844674407370955, + "md5sum": "cb7b244bc0e995792a11b66c16ff7326", + "mins": [ + 1320310933848, + 2892413254452, + 15213704816644, + 33303080542536, + 54133931834575, + 62637974235196, + 73477801834385, + 97062588236658, + 123773029365536, + 137362439086635, + 144823221255214, + 156752128234991, + 164057670737197, + 171712051010595, + 182665537107203, + 186020241451047, + 188186590099779, + 188838466536613, + 194997125034804, + 213251121966702, + 225023655196506, + 261479580480913, + 272026984650472, + 279756677202662, + 314355751041783, + 334170383956019, + 342634816971008, + 349914351085019, + 379182580303360, + 381729062503240, + 388776072441517, + 402906003856268, + 406210469872200, + 409809743078042, + 460702276810632, + 466900009927013, + 476591670784726, + 479104980728793, + 479177204470606, + 501230591082143, + 513924950317396, + 517715885923524, + 527819610502727, + 543326239092256, + 577479777570666, + 586068314270801, + 597928637040879, + 617125356577203, + 617533038767006, + 636282211835547, + 667104484112904, + 668576287207946, + 677204546865933, + 680164730127857, + 720596965972113, + 729433410425559, + 
745196773431240, + 757774697729009, + 768840317595611, + 770136513945682, + 782785511009571, + 793596289997450, + 796577523630424, + 798433421887896, + 845938949114618, + 851286087627374, + 858504228997115, + 881067751355174, + 891402467970305, + 904011914047572, + 913222451749990, + 914088491856141, + 923921990467907, + 936268759660907, + 959436250013717, + 973965338808796, + 980258323547582, + 1031340326504494, + 1047144021507888, + 1064605520125331, + 1065439721360504, + 1074736673607863, + 1077196565012501, + 1086881600664146, + 1104484446044355, + 1105521864299602, + 1107952839323628, + 1139991992217537, + 1145511749123640, + 1156404070085198, + 1163505903251654, + 1165188978580566, + 1166367705497716, + 1213677595655023, + 1220655070177313, + 1280256658687935, + 1280332700106146, + 1287486058316823, + 1292589574221309, + 1317512056543188, + 1327701823350156, + 1328746743195849, + 1336689255973730, + 1337687585058361, + 1340522866763452, + 1343727559872570, + 1345815115250482, + 1349455026997328, + 1364253926273049, + 1369215875180020, + 1370623984538842, + 1372207466474251, + 1377199785245686, + 1378341296517922, + 1385567931158094, + 1422023516698097, + 1423086098527647, + 1460093314304867, + 1467092863571523, + 1468886259972494, + 1505908035366497, + 1527945363183526, + 1528015454855597, + 1530020216407443, + 1536336673508306, + 1540701781788395, + 1544222075541055, + 1546955615457752, + 1564816125719807, + 1569008527089287, + 1575851513527578, + 1584190737700516, + 1595651307258555, + 1595677562213639, + 1596985703401331, + 1601553698420037, + 1670232723950711, + 1708023052201102, + 1710607948167940, + 1718816135885184, + 1727386244281103, + 1732179872420038, + 1737265230552485, + 1739874960586106, + 1743790829382847, + 1751595833712019, + 1765640390860909, + 1781245879675999, + 1788125073323888, + 1806421837574930, + 1822827682426438, + 1835165435181191, + 1839026611330731, + 1840815351462327 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + 
}, + { + "ksize": 31, + "max_hash": 1844674407370955, + "md5sum": "c54c70afe8d021a2f8c1aa2cd0bb4bb6", + "mins": [ + 4326955763283, + 7158799582087, + 23738015571950, + 24224314530175, + 42325549179236, + 64509359584961, + 65062716770777, + 83398355305453, + 92747420401797, + 124035655424003, + 127313944559960, + 132116747543577, + 133330625460294, + 144280372722709, + 144384767973149, + 145603209243740, + 150445332721306, + 156763944975614, + 161038167145035, + 163710911470311, + 188530049183223, + 194230965715185, + 205331160625393, + 205672253215890, + 211165710707279, + 224650264198442, + 230703845052601, + 235023183594108, + 241807515829080, + 255101064467816, + 279376953099972, + 298541111621688, + 299928161543665, + 307236951551041, + 327780792286603, + 353390289593063, + 361470005717781, + 372507579642397, + 392843236904911, + 396649849298484, + 406050957932272, + 408472664407451, + 430477161830012, + 434648552401784, + 439292176300010, + 445836063143558, + 446791063545911, + 476004025959057, + 490569823544854, + 491113374084738, + 507432650225682, + 513532811012889, + 514206374296874, + 522617232301446, + 531634333064453, + 601615954910708, + 602337027915933, + 602812963350963, + 623035558717821, + 625586030923293, + 627451805812462, + 641611252315184, + 643528692945958, + 645876596829776, + 658835579608551, + 677764652048342, + 682543142973118, + 684668052450211, + 685495428308727, + 686695886754969, + 692476769447259, + 704615752139438, + 740069424767386, + 745130020812751, + 756815101716293, + 763345401324164, + 772793860002582, + 779858178416795, + 780860648830821, + 783647995529193, + 785569928910679, + 800181195992144, + 820976270639292, + 830506996571544, + 835567606030461, + 836982323489185, + 860210700371585, + 870815004132635, + 876888038506266, + 884456650670029, + 893915017062331, + 902358906593190, + 937130179358141, + 947462708051771, + 958549480584791, + 991786633438918, + 1003706164087139, + 1017432414969433, + 1060964007653270, + 
1075520057734223, + 1097036351240365, + 1100485334805616, + 1102709029284409, + 1108109366559790, + 1132533238161930, + 1138882566797847, + 1140334908428827, + 1140978689328860, + 1141358484754370, + 1147856635337766, + 1149859754057319, + 1156108623222019, + 1158332938137288, + 1163982198761717, + 1168856596409714, + 1188879900225256, + 1194994639145223, + 1198504034991418, + 1207705717654311, + 1217704507818923, + 1243967225052122, + 1245816885076600, + 1250394771314603, + 1275277584936870, + 1282805064459123, + 1297142062646069, + 1300734114223911, + 1304579544156546, + 1325105945243045, + 1325880536195852, + 1331620138243907, + 1340142053189959, + 1341056039384211, + 1346153539817428, + 1350573964498633, + 1352179949520117, + 1370236218149841, + 1396299747291170, + 1396737983856138, + 1402153580733857, + 1403507365256538, + 1407570060776159, + 1418672243187610, + 1442450169026342, + 1442667040617666, + 1464341697678878, + 1465570493913423, + 1487555602629223, + 1489138355617241, + 1490108744092588, + 1492210873197668, + 1500840557484266, + 1523291285422917, + 1556328723480323, + 1574673368458998, + 1582216342147004, + 1584562361477530, + 1584580201342206, + 1613756014752074, + 1620038983317120, + 1626079573118911, + 1644134626794663, + 1654097985563011, + 1665537417691967, + 1670459258658688, + 1670542546001691, + 1672801158400120, + 1674162523451589, + 1687505585245005, + 1688966133175948, + 1704469571428976, + 1715363742519159, + 1726660028167080, + 1728892087170449, + 1770175378346274, + 1778352428854204, + 1807686194229722, + 1807975427341585, + 1817145314970484, + 1824763509285723 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + }, + { + "ksize": 51, + "max_hash": 1844674407370955, + "md5sum": "32aa621bee8929ac9560d6565bcfb862", + "mins": [ + 19411759939315, + 19840679255521, + 29031664402932, + 45785195999855, + 48690512646604, + 71674775483449, + 75370332699226, + 88405855739208, + 88459306363425, + 91927271358308, + 122195605266193, + 
122374588487548, + 122439295505808, + 124114576015424, + 143281575899339, + 164536118500996, + 174948481712911, + 197592037400753, + 199681205739142, + 207027103039423, + 230325100062892, + 230414837839661, + 236014655428808, + 251912097380427, + 267630627834446, + 277675546479362, + 279647193782083, + 279895769780699, + 302246695262151, + 305711110486640, + 306380445338900, + 306817076837397, + 307146961878210, + 335172906261746, + 347379664864332, + 349570947446321, + 355423863000320, + 371938160670904, + 378569055722803, + 379790789459661, + 400330384590588, + 410147321680783, + 417351312784043, + 424376531696268, + 433979163602316, + 438895668052137, + 445590816830794, + 448840730316285, + 460982894091777, + 482261883145101, + 487876204544964, + 517235307599436, + 518076611402125, + 544807655567270, + 545509020619884, + 570384877175481, + 575475396040697, + 586820075650446, + 591296476318629, + 593119817248178, + 602623078842436, + 607558888705879, + 612425503652539, + 617368308812423, + 645751934976692, + 646002657804968, + 646999412675240, + 658760814167802, + 674299115538132, + 707795357415042, + 709129645338098, + 715606877552616, + 727501178349554, + 740217592204445, + 746328367957075, + 747061974489806, + 774766039014032, + 793136211156440, + 815785551476871, + 819270218646907, + 848963887376512, + 855277889733619, + 856537344600601, + 872661537916140, + 873741340844162, + 875821258419172, + 892718120209189, + 896260751343942, + 916342460158974, + 925354307527105, + 955632580380849, + 960479220143919, + 971605344762738, + 973405644012468, + 999250735815079, + 1018631769791280, + 1025155812015927, + 1025845738670644, + 1027960613879281, + 1029927517751606, + 1037409076499403, + 1041280033939233, + 1044982327727834, + 1050885671868489, + 1075310329482619, + 1096522422922503, + 1106742051652186, + 1112427053157635, + 1112558009383439, + 1113925965229859, + 1120188413411669, + 1139721214039033, + 1146738350808223, + 1182562187194968, + 1199372832076280, + 
1215294699121949, + 1231415193214523, + 1237203006689267, + 1239801318411284, + 1249970025513116, + 1266625644558623, + 1266656407240915, + 1270567543214354, + 1284151799534550, + 1286818488256795, + 1303323680538246, + 1328967688224434, + 1332112042265973, + 1333493819035636, + 1347581721791363, + 1357019919103664, + 1363813349298794, + 1387384593698582, + 1391231162570791, + 1437331822392567, + 1445854969423377, + 1448949389524309, + 1466735371003594, + 1472056993349742, + 1474241810485674, + 1498536125295617, + 1506642273063471, + 1514321203958384, + 1527190825666515, + 1541118156017567, + 1542550183340084, + 1571013838544069, + 1580019657823074, + 1612922913297628, + 1616804425632742, + 1644222069361557, + 1652721631958370, + 1669665601632528, + 1672091842557006, + 1680944260903970, + 1698342533820443, + 1705623168724842, + 1715871032834862, + 1724563270254773, + 1726014444818067, + 1740790467513551, + 1741686672280312, + 1756065242023281, + 1765981654433890, + 1786173343634461, + 1796119195553604, + 1816904589355175, + 1817593137089296, + 1826576981245029, + 1843735434427067 + ], + "molecule": "DNA", + "num": 4294967295, + "seed": 42 + } + ], + "type": "mrnaseq", + "version": 0.4 + } +] \ No newline at end of file diff --git a/tests/test-data/sqlite/README.md b/tests/test-data/sqlite/README.md new file mode 100644 index 0000000000..90b43178e6 --- /dev/null +++ b/tests/test-data/sqlite/README.md @@ -0,0 +1,15 @@ +# test files for SqliteIndex etc. functionality + +`prot.sqlmf` is a SQL version of the manifest in `tests/test-data/prot/all.zip`. + +`delmont-6.csv` is a fixed-up version of `tests/test-data/lca/delmont-6.csv` that works with `sourmash tax`. + +`lca.sqldb` is an `LCA_SqliteDatabase` created with `TARA_ASE_MAG_00031` and `TARA_PSW_MAG_00136`, using the lineage in `delmont-6.csv`. + +`test.taxonomy.db` is a SqliteLineage v1.0 lineage db created with `sourmash tax prepare` from `tests/test-data/tax/test.taxonomy.db`. 
+ +`index.sqldb` is a k=31 sqldb created from `tests/test-data/{47,63}.fa.sig`. + +`shewanella-lineage.csv` is a hand-hacked file containing lineages for 47 and 63. + +`lca-2.sqldb` is an `LCA_SqliteDatabase` created from `tests/test-data/{47,63}.fa.sig` and `shewanella-lineage.csv`. diff --git a/tests/test-data/sqlite/delmont-6.csv b/tests/test-data/sqlite/delmont-6.csv new file mode 100644 index 0000000000..418b2cab32 --- /dev/null +++ b/tests/test-data/sqlite/delmont-6.csv @@ -0,0 +1,3 @@ +ident,superkingdom,phylum,class,order,family,genus,species +TARA_ASE_MAG_00031,Bacteria,Proteobacteria,,,Alteromonadaceae,, +TARA_PSW_MAG_00136,Eukaryota,Chlorophyta,Prasinophyceae,,,Ostreococcus,na diff --git a/tests/test-data/sqlite/index.sqldb b/tests/test-data/sqlite/index.sqldb new file mode 100644 index 0000000000..faaed61c33 Binary files /dev/null and b/tests/test-data/sqlite/index.sqldb differ diff --git a/tests/test-data/sqlite/lca-2.sqldb b/tests/test-data/sqlite/lca-2.sqldb new file mode 100644 index 0000000000..de0fad21b7 Binary files /dev/null and b/tests/test-data/sqlite/lca-2.sqldb differ diff --git a/tests/test-data/sqlite/lca.sqldb b/tests/test-data/sqlite/lca.sqldb new file mode 100644 index 0000000000..8ce4dfaa0a Binary files /dev/null and b/tests/test-data/sqlite/lca.sqldb differ diff --git a/tests/test-data/sqlite/prot.sqlmf b/tests/test-data/sqlite/prot.sqlmf new file mode 100644 index 0000000000..a5d4f3475d Binary files /dev/null and b/tests/test-data/sqlite/prot.sqlmf differ diff --git a/tests/test-data/sqlite/shewanella-lineage.csv b/tests/test-data/sqlite/shewanella-lineage.csv new file mode 100644 index 0000000000..1ac709f4b8 --- /dev/null +++ b/tests/test-data/sqlite/shewanella-lineage.csv @@ -0,0 +1,3 @@ +identifiers,superkingdom,phylum,class,order,family,genus,species,strain +NC_009665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,
+NC_011663.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica, diff --git a/tests/test-data/sqlite/test.taxonomy.db b/tests/test-data/sqlite/test.taxonomy.db new file mode 100644 index 0000000000..babd2206e1 Binary files /dev/null and b/tests/test-data/sqlite/test.taxonomy.db differ diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index cd93349f7e..7340f38dae 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -208,7 +208,7 @@ def test_sig_merge_3_abund_ab_ok(c): c.run_sourmash('sig', 'merge', sig47abund, sig63abund) actual_merge_sig = sourmash.load_one_signature(c.last_result.out) - # @CTB: should check that this merge did what we think it should do! + # CTB: should check that this merge did what we think it should do! @utils.in_tempdir @@ -3284,6 +3284,25 @@ def test_sig_describe_empty(c): assert 'source file: ** no name **' in c.last_result.out +def test_sig_describe_sqldb(runtmp): + # make a sqldb and run fileinfo on it + gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) + sqldb = runtmp.output('some.sqldb') + + runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + + runtmp.sourmash('sig', 'describe', sqldb) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert 'md5: 4289d4241be8573145282352215ca3c4' in out + assert 'md5: 85c3aeec6457c0b1d210472ddeb67714' in out + + def test_sig_describe_2_csv(runtmp): # output info in CSV spreadsheet c = runtmp @@ -3537,6 +3556,54 @@ def test_sig_manifest_1_zipfile(runtmp): assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list +def test_sig_manifest_1_zipfile_already_exists(runtmp): + # make a manifest from a .zip file; f + protzip = utils.get_test_data('prot/protein.zip') + + mf_csv = runtmp.output('mf.csv') + with open(mf_csv, "w") as fp: + fp.write("hello, world") + + with pytest.raises(SourmashCommandFailed): + 
runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv') + + +def test_sig_manifest_1_zipfile_already_exists_force(runtmp): + # make a manifest from a .zip file + protzip = utils.get_test_data('prot/protein.zip') + + mf_csv = runtmp.output('mf.csv') + with open(mf_csv, "w") as fp: + fp.write("hello, world") + + runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv', '-f') + + with open(mf_csv, newline='') as csvfp: + manifest = CollectionManifest.load_from_csv(csvfp) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + +def test_sig_manifest_1_zipfile_already_exists_sql(runtmp): + # make a manifest from a .zip file + protzip = utils.get_test_data('prot/protein.zip') + + mf_csv = runtmp.output('mf.mfsql') + runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql') + runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql', + '-f') + + manifest = CollectionManifest.load_from_filename(mf_csv) + + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + + def test_sig_manifest_2_sigfile(runtmp): # make a manifest from a .sig file sigfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') @@ -3690,6 +3757,62 @@ def test_sig_manifest_7_allzip_3(runtmp): assert 'dna-sig.noext' in filenames +def test_sig_manifest_8_sqldb(runtmp): + # make a sqldb and then run sig manifest on it. 
+ gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) + sqldb = runtmp.output('some.sqldb') + + runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + + # need to use '--no-rebuild-manifest' with 'sig manifest' on sqldb, + # because it has a manifest but not the _signatures_with_internal + # method to rebuild one ;) + + # so, this should fail... + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv') + + # ...and this should succeed: + runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv', + '--no-rebuild') + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert 'manifest contains 12 signatures total.' in err + assert "wrote manifest to 'mf.csv'" in err + + mf = CollectionManifest.load_from_filename(runtmp.output('mf.csv')) + assert len(mf) == 12 + + +def test_sig_manifest_8_sqldb_out(runtmp): + # make a zip and run manifest out on it to make a sql format manifest. + gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) + zipfile = runtmp.output('some.zip') + + runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', zipfile) + + # ...and this should succeed: + runtmp.sourmash('sig', 'manifest', zipfile, '-o', 'mf.sqldb', + '-F', 'sql') + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert 'manifest contains 12 signatures total.' 
in err + assert "wrote manifest to 'mf.sqldb'" in err + + mf = CollectionManifest.load_from_filename(runtmp.output('mf.sqldb')) + assert len(mf) == 12 + + def test_sig_kmers_1_dna(runtmp): # test sig kmers on dna seqfile = utils.get_test_data('short.fa') @@ -4420,6 +4543,31 @@ def test_sig_check_1_ksize(runtmp): assert 31 in ksizes +def test_sig_check_1_ksize_output_sql(runtmp): + # basic check functionality with selection for ksize + sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/salmonella-picklist.csv') + + runtmp.sourmash('sig', 'check', *sigfiles, '-k', '31', + "--picklist", f"{picklist}::manifest", + "-m", "mf.mfsql", "-F", "sql") + + out_mf = runtmp.output('mf.mfsql') + assert os.path.exists(out_mf) + + # 8 of the 24 should match. + mf = CollectionManifest.load_from_filename(out_mf) + assert len(mf) == 8 + assert mf.conn # check that it's a sqlite manifest! hacky... + + idx = sourmash.load_file_as_index(out_mf) + siglist = list(idx.signatures()) + assert len(siglist) == 8 + ksizes = set([ ss.minhash.ksize for ss in siglist ]) + assert len(ksizes) == 1 + assert 31 in ksizes + + def test_sig_check_2_output_missing(runtmp): # output missing all as identical to input picklist sigfiles = utils.get_test_data('gather/combined.sig') @@ -4510,7 +4658,7 @@ def test_sig_check_2_output_missing_exclude(runtmp): assert "** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'" in str(exc) -def test_check_3_no_manifest(runtmp): +def test_sig_check_3_no_manifest(runtmp): # fail check when no manifest, by default sbt = utils.get_test_data('v6.sbt.zip') picklist = utils.get_test_data('v6.sbt.zip.mf.csv') @@ -4526,7 +4674,7 @@ def test_check_3_no_manifest(runtmp): assert "sig check requires a manifest by default, but no manifest present." 
in err -def test_check_3_no_manifest_ok(runtmp): +def test_sig_check_3_no_manifest_ok(runtmp): # generate manifest if --no-require-manifest sbt = utils.get_test_data('v6.sbt.zip') picklist = utils.get_test_data('v6.sbt.zip.mf.csv') diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py index 6df3aed33f..33bd649748 100644 --- a/tests/test_cmd_signature_fileinfo.py +++ b/tests/test_cmd_signature_fileinfo.py @@ -3,9 +3,10 @@ """ import shutil import os +import glob +import json import pytest -import json import sourmash_tst_utils as utils from sourmash_tst_utils import SourmashCommandFailed @@ -364,3 +365,70 @@ def test_sig_fileinfo_8_manifest_works_when_moved(runtmp): assert 'has manifest? yes' in out assert 'is database? yes' in out assert 'path filetype: StandaloneManifestIndex' in out + + +def test_sig_fileinfo_9_sqldb_make(runtmp): + # make a sqldb and run fileinfo on it + gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) + sqldb = runtmp.output('some.sqldb') + + runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + + runtmp.sourmash('sig', 'fileinfo', sqldb) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "12 sketches with DNA, k=31, scaled=10000 4540 total hashes" in out + + +def test_sig_fileinfo_9_sqldb_exists(runtmp): + # run fileinfo on existing sqldb + sqldb = utils.get_test_data('sqlite/index.sqldb') + runtmp.sourmash('sig', 'fileinfo', sqldb) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "path filetype: SqliteIndex" in out + assert "2 sketches with DNA, k=31, scaled=1000 10415 total hashes" in out + + +def test_sig_fileinfo_9_sql_manifest(runtmp): + # run fileinfo on existing sqldb + sqldb = utils.get_test_data('sqlite/prot.sqlmf') + runtmp.sourmash('sig', 'fileinfo', sqldb) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "path 
filetype: StandaloneManifestIndex" in out + assert "num signatures: 7" in out + assert "1 sketches with DNA, k=31, scaled=1000 5238 total hashes" in out + assert "2 sketches with hp, k=19, scaled=100 5184 total hashes" in out + assert "2 sketches with dayhoff, k=19, scaled=100 7945 total hashes" in out + assert "2 sketches with protein, k=19, scaled=100 8214 total hashes" in out + + +def test_sig_fileinfo_9_sql_lca_db(runtmp): + # run fileinfo on existing sqldb + sqldb = utils.get_test_data('sqlite/lca.sqldb') + runtmp.sourmash('sig', 'fileinfo', sqldb) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "path filetype: LCA_SqliteDatabase" in out + assert "2 sketches with DNA, k=31, scaled=10000 1431 total hashes" in out diff --git a/tests/test_index_protocol.py b/tests/test_index_protocol.py index ff08d5dc46..22498fcd04 100644 --- a/tests/test_index_protocol.py +++ b/tests/test_index_protocol.py @@ -10,10 +10,11 @@ from sourmash.index import (LinearIndex, ZipFileLinearIndex, LazyLinearIndex, MultiIndex, StandaloneManifestIndex, LazyLoadedIndex) +from sourmash.index.sqlite_index import SqliteIndex from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory -from sourmash.manifest import CollectionManifest -from sourmash.lca.lca_db import LCA_Database +from sourmash.manifest import CollectionManifest, BaseCollectionManifest +from sourmash.lca.lca_db import LCA_Database, load_single_database import sourmash_tst_utils as utils @@ -127,6 +128,25 @@ def build_lca_index_save_load(runtmp): return sourmash.load_file_as_index(outfile) +def build_lca_index_save_load(runtmp): + db = build_lca_index(runtmp) + outfile = runtmp.output('db.lca.json') + db.save(outfile) + + return sourmash.load_file_as_index(outfile) + + +def build_sqlite_index(runtmp): + filename = runtmp.output('idx.sqldb') + db = SqliteIndex.create(filename) + + siglist = _load_three_sigs() + for ss in siglist: + db.insert(ss) + + 
return db + + def build_lazy_loaded_index(runtmp): db = build_lca_index(runtmp) outfile = runtmp.output('db.lca.json') @@ -147,6 +167,17 @@ def build_revindex(runtmp): return lidx +def build_lca_index_save_load_sql(runtmp): + db = build_lca_index(runtmp) + outfile = runtmp.output('db.lca.json') + db.save(outfile, format='sql') + + x = load_single_database(outfile) + db_load = x[0] + + return db_load + + # # create a fixture 'index_obj' that is parameterized by all of these # building functions. @@ -161,6 +192,8 @@ def build_revindex(runtmp): build_lca_index, build_sbt_index_save_load, build_lca_index_save_load, + build_sqlite_index, + build_lca_index_save_load_sql, build_lazy_loaded_index, # build_revindex, ] @@ -257,6 +290,20 @@ def test_index_signatures(index_obj): assert ss63.md5sum() in md5s +def test_index_signatures_with_location(index_obj): + # signatures_with_location works? + siglist = list(index_obj.signatures_with_location()) + + ss2, ss47, ss63 = _load_three_sigs() + assert len(siglist) == 3 + + # check md5sums, since 'in' doesn't always work + md5s = set(( ss.md5sum() for ss, loc in siglist )) + assert ss2.md5sum() in md5s + assert ss47.md5sum() in md5s + assert ss63.md5sum() in md5s + + def test_index_len(index_obj): # len works? assert len(index_obj) == 3 @@ -267,6 +314,18 @@ def test_index_bool(index_obj): assert bool(index_obj) +def test_index_location(index_obj): + # location works? 
+ assert str(index_obj.location) + + +def test_index_manifest(index_obj): + # manifest is either None or a BaseCollectionManifest + manifest = index_obj.manifest + if manifest is not None: + assert isinstance(manifest, BaseCollectionManifest) + + def test_index_select_basic(index_obj): # select does the basic thing ok idx = index_obj.select(ksize=31, moltype='DNA', abund=False, @@ -337,7 +396,7 @@ def test_index_gather(index_obj): assert matches[0].signature.minhash == ss47.minhash -def test_linear_gather_threshold_1(index_obj): +def test_index_gather_threshold_1(index_obj): # test gather() method, in some detail ss2, ss47, ss63 = _load_three_sigs() diff --git a/tests/test_lca.py b/tests/test_lca.py index 84e6843fa7..28aa7862ea 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -741,19 +741,19 @@ def test_run_sourmash_lca(): assert status != 0 # no args provided, ok ;) -def test_basic_index(runtmp): +def test_basic_index(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(lca_db) + assert os.path.exists(lca_db), lca_db assert 'Building LCA database with ksize=31 scaled=10000 moltype=DNA' in runtmp.last_result.err assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err @@ -761,32 +761,52 @@ def test_basic_index(runtmp): assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err -def test_basic_index_bad_spreadsheet(runtmp): +def test_basic_index_twice(runtmp, lca_db_format): + # run 'lca index' twice. 
+ taxcsv = utils.get_test_data('lca/delmont-1.csv') + input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + + cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + runtmp.sourmash(*cmd) + + with pytest.raises(SourmashCommandFailed): + cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + runtmp.sourmash(*cmd) + + print(cmd) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'already exists. Not overwriting.' in runtmp.last_result.err + + +def test_basic_index_bad_spreadsheet(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/bad-spreadsheet.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(lca_db) + assert os.path.exists(lca_db), lca_db assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' 
in runtmp.last_result.err -def test_basic_index_broken_spreadsheet(runtmp): +def test_basic_index_broken_spreadsheet(runtmp, lca_db_format): # duplicate identifiers in this spreadsheet taxcsv = utils.get_test_data('lca/bad-spreadsheet-2.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) @@ -794,30 +814,31 @@ def test_basic_index_broken_spreadsheet(runtmp): assert "multiple lineages for identifier TARA_ASE_MAG_00031" in runtmp.last_result.err -def test_basic_index_too_many_strains_too_few_species(runtmp): +def test_basic_index_too_many_strains_too_few_species(runtmp, lca_db_format): # explicit test for #841, where 'n_species' wasn't getting counted # if lineage was at strain level resolution. 
taxcsv = utils.get_test_data('lca/podar-lineage.csv') input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output('out.lca.json') + lca_db = runtmp.output(f'out.lca.{lca_db_format}') cmd = ['lca', 'index', taxcsv, lca_db, input_sig, - '-C', '3', '--split-identifiers'] + '-C', '3', '--split-identifiers', '-F', lca_db_format] runtmp.sourmash(*cmd) assert not 'error: fewer than 20% of lineages' in runtmp.last_result.err assert runtmp.last_result.status == 0 -def test_basic_index_too_few_species(runtmp): +def test_basic_index_too_few_species(runtmp, lca_db_format): # spreadsheets with too few species should be flagged, unless -f specified taxcsv = utils.get_test_data('lca/tully-genome-sigs.classify.csv') # (these don't really matter, should break on load spreadsheet) input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output('out.lca.json') + lca_db = runtmp.output(f'out.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-C', '3'] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-C', '3', + '-F', lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) @@ -825,13 +846,14 @@ def test_basic_index_too_few_species(runtmp): assert runtmp.last_result.status != 0 -def test_basic_index_require_taxonomy(runtmp): +def test_basic_index_require_taxonomy(runtmp, lca_db_format): # no taxonomy in here taxcsv = utils.get_test_data('lca/bad-spreadsheet-3.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', '--require-taxonomy', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', '--require-taxonomy', taxcsv, lca_db, input_sig, + '-F', lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) @@ -839,12 +861,13 @@ def test_basic_index_require_taxonomy(runtmp): assert "ERROR: no hash values found - are there any signatures?" 
in runtmp.last_result.err -def test_basic_index_column_start(runtmp): +def test_basic_index_column_start(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-3.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', '-C', '3', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', '-C', '3', taxcsv, lca_db, input_sig, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -858,8 +881,9 @@ def test_basic_index_column_start(runtmp): assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err -@utils.in_tempdir -def test_index_empty_sketch_name(c): +def test_index_empty_sketch_name(runtmp, lca_db_format): + c = runtmp + # create two signatures with empty 'name' attributes cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'), utils.get_test_data('genome-s11.fa.gz')] @@ -870,23 +894,31 @@ def test_index_empty_sketch_name(c): sig2 = c.output('genome-s12.fa.gz.sig') assert os.path.exists(sig2) + outfile = f'zzz.lca.{lca_db_format}' + # can we insert them both? 
taxcsv = utils.get_test_data('lca/delmont-1.csv') - cmd = ['lca', 'index', taxcsv, 'zzz', sig1, sig2] + cmd = ['lca', 'index', taxcsv, outfile, sig1, sig2, '-F', lca_db_format] c.run_sourmash(*cmd) - assert os.path.exists(c.output('zzz.lca.json')) + + assert os.path.exists(c.output(outfile)) print(c.last_result.out) print(c.last_result.err) assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err -def test_basic_index_and_classify_with_tsv_and_gz(runtmp): +def test_basic_index_and_classify_with_tsv_and_gz(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-1.tsv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json.gz') - cmd = ['lca', 'index', '--tabs', '--no-header', taxcsv, lca_db, input_sig] + if lca_db_format == 'json': + lca_db = runtmp.output(f'delmont-1.lca.json.gz') + else: + lca_db = runtmp.output(f'delmont-1.lca.sql') + + cmd = ['lca', 'index', '--tabs', '--no-header', taxcsv, lca_db, input_sig, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -910,12 +942,12 @@ def test_basic_index_and_classify_with_tsv_and_gz(runtmp): assert 'loaded 1 LCA databases' in runtmp.last_result.err -def test_basic_index_and_classify(runtmp): +def test_basic_index_and_classify(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -941,16 +973,51 @@ def test_basic_index_and_classify(runtmp): assert 'loaded 1 LCA databases' in runtmp.last_result.err -def test_index_traverse(runtmp): +def test_basic_index_and_classify_dup_lineage(runtmp, lca_db_format): + taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') + input_sig1 = 
utils.get_test_data('lca/TARA_ASE_MAG_00007.sig') + input_sig2 = utils.get_test_data('lca/TARA_ANW_MAG_00005.sig') + lca_db = runtmp.output(f'delmont-dup.lca.{lca_db_format}') + + cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format, '-f'] + runtmp.sourmash(*cmd) + + print(cmd) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert os.path.exists(lca_db) + + cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1] + runtmp.sourmash(*cmd) + + print(cmd) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out + + cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig2] + runtmp.sourmash(*cmd) + + print(cmd) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out + + +def test_index_traverse(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') in_dir = runtmp.output('sigs') os.mkdir(in_dir) shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig')) - cmd = ['lca', 'index', taxcsv, lca_db, in_dir] + cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -965,12 +1032,12 @@ def test_index_traverse(runtmp): assert 'WARNING: 1 duplicate signatures.' 
not in runtmp.last_result.err -@utils.in_tempdir -def test_index_traverse_force(c): +def test_index_traverse_force(runtmp, lca_db_format): + c = runtmp # test the use of --force to load all files, not just .sig taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = c.output(f'delmont-1.lca.{lca_db_format}') in_dir = c.output('sigs') os.mkdir(in_dir) @@ -978,7 +1045,7 @@ def test_index_traverse_force(c): shutil.copyfile(input_sig, os.path.join(in_dir, 'q.txt')) # use --force - cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-f'] + cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-f', '-F', lca_db_format] c.run_sourmash(*cmd) out = c.last_result.out @@ -994,17 +1061,18 @@ def test_index_traverse_force(c): assert 'WARNING: 1 duplicate signatures.' not in err -@utils.in_tempdir -def test_index_from_file_cmdline_sig(c): +def test_index_from_file_cmdline_sig(runtmp, lca_db_format): + c = runtmp taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = c.output(f'delmont-1.lca.{lca_db_format}') file_list = c.output('sigs.list') with open(file_list, 'wt') as fp: print(input_sig, file=fp) - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--from-file', file_list] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--from-file', file_list, + '-F', lca_db_format] c.run_sourmash(*cmd) out = c.last_result.out @@ -1020,17 +1088,19 @@ def test_index_from_file_cmdline_sig(c): assert 'WARNING: 1 duplicate signatures.' 
in err -@utils.in_tempdir -def test_index_from_file(c): +def test_index_from_file(runtmp, lca_db_format): + c = runtmp + taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = c.output(f'delmont-1.lca.{lca_db_format}') file_list = c.output('sigs.list') with open(file_list, 'wt') as fp: print(input_sig, file=fp) - cmd = ['lca', 'index', taxcsv, lca_db, '--from-file', file_list] + cmd = ['lca', 'index', taxcsv, lca_db, '--from-file', file_list, + '-F', lca_db_format] c.run_sourmash(*cmd) out = c.last_result.out @@ -1045,14 +1115,15 @@ def test_index_from_file(c): assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err -@utils.in_tempdir -def test_index_fail_on_num(c): +def test_index_fail_on_num(runtmp, lca_db_format): + c = runtmp # lca index should yield a decent error message when attempted on 'num' sigfile = utils.get_test_data('num/63.fa.sig') taxcsv = utils.get_test_data('lca/podar-lineage.csv') with pytest.raises(SourmashCommandFailed): - c.run_sourmash('lca', 'index', taxcsv, 'xxx.lca.json', sigfile, '-C', '3') + c.run_sourmash('lca', 'index', taxcsv, f'xxx.lca.{lca_db_format}', sigfile, + '-C', '3', '-F', lca_db_format) err = c.last_result.err print(err) @@ -1061,12 +1132,13 @@ def test_index_fail_on_num(c): assert 'ERROR: cannot downsample signature; is it a scaled signature?' 
in err -def test_index_traverse_real_spreadsheet_no_report(runtmp): +def test_index_traverse_real_spreadsheet_no_report(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-f'] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-f', + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1083,14 +1155,14 @@ def test_index_traverse_real_spreadsheet_no_report(runtmp): assert '(You can use --report to generate a detailed report.)' in runtmp.last_result.err -def test_index_traverse_real_spreadsheet_report(runtmp): +def test_index_traverse_real_spreadsheet_report(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') report_loc = runtmp.output('report.txt') cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--report', - report_loc, '-f'] + report_loc, '-f', '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1125,7 +1197,7 @@ def test_single_classify(runtmp): def test_single_classify_to_output(runtmp): - db1 = utils.get_test_data('lca/delmont-1.lca.json') + db1 = utils.get_test_data(f'lca/delmont-1.lca.json') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') cmd = ['lca', 'classify', '--db', db1, '--query', input_sig, @@ -1144,7 +1216,7 @@ def test_single_classify_to_output(runtmp): def test_single_classify_to_output_no_name(runtmp): - db1 = utils.get_test_data('lca/delmont-1.lca.json') + db1 = utils.get_test_data(f'lca/delmont-1.lca.json') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') ss = sourmash.load_one_signature(input_sig, ksize=31) @@ -1170,7 +1242,7 @@ def 
test_single_classify_to_output_no_name(runtmp): def test_single_classify_empty(runtmp): - db1 = utils.get_test_data('lca/both.lca.json') + db1 = utils.get_test_data(f'lca/both.lca.json') input_sig = utils.get_test_data('GCF_000005845.2_ASM584v2_genomic.fna.gz.sig') cmd = ['lca', 'classify', '--db', db1, '--query', input_sig] @@ -1186,7 +1258,7 @@ def test_single_classify_empty(runtmp): def test_single_classify_traverse(runtmp): - db1 = utils.get_test_data('lca/delmont-1.lca.json') + db1 = utils.get_test_data(f'lca/delmont-1.lca.json') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') in_dir = runtmp.output('sigs') os.mkdir(in_dir) @@ -1206,7 +1278,7 @@ def test_single_classify_traverse(runtmp): def test_multi_query_classify_traverse(runtmp): # both.lca.json is built from both dir and dir2 - db1 = utils.get_test_data('lca/both.lca.json') + db1 = utils.get_test_data(f'lca/both.lca.json') dir1 = utils.get_test_data('lca/dir1') dir2 = utils.get_test_data('lca/dir2') @@ -1217,7 +1289,7 @@ def test_multi_query_classify_traverse(runtmp): print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp: + with open(utils.get_test_data('lca/classify-by-both.csv')) as fp: fp_lines = fp.readlines() out_lines = runtmp.last_result.out.splitlines() @@ -1247,7 +1319,7 @@ def test_multi_query_classify_query_from_file(c): c.run_sourmash(*cmd) out = c.last_result.out - with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp: + with open(utils.get_test_data('lca/classify-by-both.csv')) as fp: fp_lines = fp.readlines() out_lines = out.splitlines() @@ -1262,7 +1334,7 @@ def test_multi_query_classify_query_from_file(c): @utils.in_tempdir def test_multi_query_classify_query_from_file_and_query(c): # both.lca.json is built from both dir and dir2 - db1 = utils.get_test_data('lca/both.lca.json') + db1 = utils.get_test_data(f'lca/both.lca.json') dir1_glob = utils.get_test_data('lca/dir1/*.sig') 
dir1_files = glob.glob(dir1_glob) dir2_glob = utils.get_test_data('lca/dir2/*.sig') @@ -1292,8 +1364,8 @@ def test_multi_query_classify_query_from_file_and_query(c): def test_multi_db_multi_query_classify_traverse(runtmp): # two halves of both.lca.json, see above test. - db1 = utils.get_test_data('lca/dir1.lca.json') - db2 = utils.get_test_data('lca/dir2.lca.json') + db1 = utils.get_test_data(f'lca/dir1.lca.json') + db2 = utils.get_test_data(f'lca/dir2.lca.json') dir1 = utils.get_test_data('lca/dir1') dir2 = utils.get_test_data('lca/dir2') @@ -1316,12 +1388,12 @@ def test_multi_db_multi_query_classify_traverse(runtmp): assert line1.strip() == line2.strip(), (line1, line2) -def test_unassigned_internal_index_and_classify(runtmp): +def test_unassigned_internal_index_and_classify(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-4.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1347,12 +1419,12 @@ def test_unassigned_internal_index_and_classify(runtmp): assert 'loaded 1 LCA databases' in runtmp.last_result.err -def test_unassigned_last_index_and_classify(runtmp): +def test_unassigned_last_index_and_classify(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-5.csv') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1378,13 +1450,14 @@ def test_unassigned_last_index_and_classify(runtmp): assert 'loaded 1 LCA databases' in runtmp.last_result.err -def 
test_index_and_classify_internal_unassigned_multi(runtmp): +def test_index_and_classify_internal_unassigned_multi(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-6.csv') input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1424,17 +1497,18 @@ def test_index_and_classify_internal_unassigned_multi(runtmp): assert 'loaded 1 LCA databases' in runtmp.last_result.err -@utils.in_tempdir -def test_classify_majority_vote_1(c): +def test_classify_majority_vote_1(runtmp, lca_db_format): # classify merged signature using lca should yield no results + c = runtmp # build database taxcsv = utils.get_test_data('lca/delmont-6.csv') input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = c.output(f'delmont-1.lca.{lca_db_format}') - c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2) + c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format) print(c.last_command) print(c.last_result.out) @@ -1464,18 +1538,20 @@ def test_classify_majority_vote_1(c): -@utils.in_tempdir -def test_classify_majority_vote_2(c): +def test_classify_majority_vote_2(runtmp, lca_db_format): # classify same signature with same database using --majority # should yield results + c = runtmp + # build database taxcsv = utils.get_test_data('lca/delmont-6.csv') input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = 
c.output(f'delmont-1.lca.{lca_db_format}') - c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2) + c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format) print(c.last_command) print(c.last_result.out) @@ -1504,17 +1580,18 @@ def test_classify_majority_vote_2(c): assert 'loaded 1 LCA databases' in c.last_result.err -@utils.in_tempdir -def test_classify_majority_vote_3(c): +def test_classify_majority_vote_3(runtmp, lca_db_format): # classify signature with nothing in counts + c = runtmp # build database taxcsv = utils.get_test_data('lca/delmont-6.csv') input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = c.output('delmont-1.lca.json') + lca_db = c.output(f'delmont-1.lca.{lca_db_format}') - c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2) + c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format) print(c.last_command) print(c.last_result.out) @@ -1543,7 +1620,7 @@ def test_classify_majority_vote_3(c): def test_multi_db_classify(runtmp): - db1 = utils.get_test_data('lca/delmont-1.lca.json') + db1 = utils.get_test_data(f'lca/delmont-1.lca.json') db2 = utils.get_test_data('lca/delmont-2.lca.json') input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') @@ -1560,13 +1637,13 @@ def test_multi_db_classify(runtmp): assert 'loaded 2 LCA databases' in runtmp.last_result.err -def test_classify_unknown_hashes(runtmp): +def test_classify_unknown_hashes(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', 
lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1704,13 +1781,13 @@ def test_single_summarize_to_output_check_filename(runtmp): print(outdata) -def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp): +def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1761,13 +1838,14 @@ def test_single_summarize_scaled(runtmp): assert '100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' -def test_multi_summarize_with_unassigned_singleton(runtmp): +def test_multi_summarize_with_unassigned_singleton(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca/delmont-6.csv') input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1816,13 +1894,14 @@ def remove_line_startswith(x, check=None): assert not out_lines -def test_summarize_to_root(runtmp): +def test_summarize_to_root(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 
'index', taxcsv, lca_db, input_sig1, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1845,13 +1924,13 @@ def test_summarize_to_root(runtmp): assert '21.4% 27 (root)' in runtmp.last_result.out -def test_summarize_unknown_hashes(runtmp): +def test_summarize_unknown_hashes(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1873,13 +1952,14 @@ def test_summarize_unknown_hashes(runtmp): assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out -def test_summarize_to_root_abund(runtmp): +def test_summarize_to_root_abund(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, + '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1901,13 +1981,13 @@ def test_summarize_to_root_abund(runtmp): assert '21.1% 27 (root)' in runtmp.last_result.out -def test_summarize_unknown_hashes_abund(runtmp): +def test_summarize_unknown_hashes_abund(runtmp, lca_db_format): taxcsv = utils.get_test_data('lca-root/tax.csv') input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') input_sig2 = 
utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = runtmp.output('lca-root.lca.json') + lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -2013,18 +2093,18 @@ def test_rankinfo_on_single(runtmp): assert not lines -def test_rankinfo_no_tax(runtmp): +def test_rankinfo_no_tax(runtmp, lca_db_format): # note: TARA_PSW_MAG_00136 is _not_ in delmont-1.csv. taxcsv = utils.get_test_data('lca/delmont-1.csv') input_sig = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - lca_db = runtmp.output('delmont-1.lca.json') + lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - cmd = ['lca', 'index', taxcsv, lca_db, input_sig] + cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] runtmp.sourmash(*cmd) - print(cmd) - print(runtmp.last_result.out) - print(runtmp.last_result.err) + print('cmd:', cmd) + print('out:', runtmp.last_result.out) + print('err:', runtmp.last_result.err) assert os.path.exists(lca_db) @@ -2117,33 +2197,37 @@ def test_compare_csv_real(runtmp): assert '0 incompatible at rank species' in runtmp.last_result.err -@utils.in_tempdir -def test_incompat_lca_db_ksize_2(c): +def test_incompat_lca_db_ksize_2(runtmp, lca_db_format): # test on gather - create a database with ksize of 25 + c = runtmp testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, '-o', 'test_db.sig') print(c) c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - 'test.lca.json', 'test_db.sig', - '-k', '25', '--scaled', '10000') + f'test.lca.{lca_db_format}', 'test_db.sig', + '-k', '25', '--scaled', '10000', + '-F', lca_db_format) print(c) # this should fail: the LCA database has ksize 25, and the query sig has # no compatible ksizes. 
with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), 'test.lca.json') + c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}') err = c.last_result.err print(err) - assert "ERROR: cannot use 'test.lca.json' for this query." in err - assert "ksize on this database is 25; this is different from requested ksize of 31" + if lca_db_format == 'sql': + assert "no compatible signatures found in 'test.lca.sql'" in err + else: + assert "ERROR: cannot use 'test.lca.json' for this query." in err + assert "ksize on this database is 25; this is different from requested ksize of 31" -@utils.in_tempdir -def test_lca_index_empty(c): +def test_lca_index_empty(runtmp, lca_db_format): + c = runtmp # test lca index with an empty taxonomy CSV, followed by a load & gather. sig2file = utils.get_test_data('2.fa.sig') sig47file = utils.get_test_data('47.fa.sig') @@ -2156,11 +2240,12 @@ def test_lca_index_empty(c): fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') # index! - c.run_sourmash('lca', 'index', 'empty.csv', 'xxx.lca.json', - sig2file, sig47file, sig63file, '--scaled', '1000') + c.run_sourmash('lca', 'index', 'empty.csv', 'xxx', + sig2file, sig47file, sig63file, '--scaled', '1000', + '-F', lca_db_format) # can we load and search? 
- lca_db_filename = c.output('xxx.lca.json') + lca_db_filename = c.output(f'xxx.lca.{lca_db_format}') db, ksize, scaled = lca_utils.load_single_database(lca_db_filename) results = db.gather(sig63) @@ -2363,18 +2448,20 @@ def test_lca_db_protein_save_load(c): assert results[0][0] == 1.0 -@utils.in_tempdir -def test_lca_db_protein_command_index(c): +def test_lca_db_protein_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with protein sigs + c = runtmp + sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - db_out = c.output('protein.lca.json') + db_out = c.output(f'protein.lca.{lca_db_format}') c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--protein') + '--scaled', '100', '-k', '19', '--protein', + '-F', lca_db_format) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] @@ -2472,18 +2559,20 @@ def test_lca_db_hp_save_load(c): assert results[0][0] == 1.0 -@utils.in_tempdir -def test_lca_db_hp_command_index(c): +def test_lca_db_hp_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with hp sigs + c = runtmp + sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - db_out = c.output('hp.lca.json') + db_out = c.output(f'hp.lca.{lca_db_format}') c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--hp') + '--scaled', '100', '-k', '19', '--hp', + '-F', lca_db_format) x = 
sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] @@ -2581,18 +2670,20 @@ def test_lca_db_dayhoff_save_load(c): assert results[0][0] == 1.0 -@utils.in_tempdir -def test_lca_db_dayhoff_command_index(c): +def test_lca_db_dayhoff_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with dayhoff sigs + c = runtmp + sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - db_out = c.output('dayhoff.lca.json') + db_out = c.output(f'dayhoff.lca.{lca_db_format}') c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--dayhoff') + '--scaled', '100', '-k', '19', '--dayhoff', + '-F', lca_db_format) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] @@ -2630,9 +2721,9 @@ def test_lca_db_dayhoff_command_search(c): assert 'the recovered matches hit 100.0% of the query' in c.last_result.out -def test_lca_index_with_picklist(runtmp): +def test_lca_index_with_picklist(runtmp, lca_db_format): gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output('gcf.lca.json') + outdb = runtmp.output(f'gcf.lca.{lca_db_format}') picklist = utils.get_test_data('gather/thermotoga-picklist.csv') # create an empty spreadsheet @@ -2640,7 +2731,8 @@ def test_lca_index_with_picklist(runtmp): fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5") + '-k', '21', '--picklist', f"{picklist}:md5:md5", + '-F', lca_db_format) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2658,9 +2750,9 @@ def test_lca_index_with_picklist(runtmp): assert 'Thermotoga' in ss.name -def 
test_lca_index_with_picklist_exclude(runtmp): +def test_lca_index_with_picklist_exclude(runtmp, lca_db_format): gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output('gcf.lca.json') + outdb = runtmp.output(f'gcf.lca.{lca_db_format}') picklist = utils.get_test_data('gather/thermotoga-picklist.csv') # create an empty spreadsheet @@ -2668,7 +2760,8 @@ def test_lca_index_with_picklist_exclude(runtmp): fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude", + '-F', lca_db_format) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2676,16 +2769,73 @@ def test_lca_index_with_picklist_exclude(runtmp): print(out) print(err) - assert "for given picklist, found 9 matches by excluding 9 distinct values" in err - assert "WARNING: 3 missing picklist values." 
- assert "WARNING: no lineage provided for 9 signatures" in err - siglist = list(sourmash.load_file_as_signatures(outdb)) assert len(siglist) == 9 for ss in siglist: assert 'Thermotoga' not in ss.name +def test_lca_index_select_with_picklist(runtmp, lca_db_format): + # check what happens with picklists after index + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + outdb = runtmp.output(f'gcf.lca.{lca_db_format}') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + # create an empty spreadsheet + with open(runtmp.output('empty.csv'), 'wt') as fp: + fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + + runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, + '-k', '21', '-F', lca_db_format) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + idx = sourmash.load_file_as_index(outdb) + picklist_obj = SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5") + picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name) + + idx = idx.select(picklist=picklist_obj) + + siglist = list(idx.signatures()) + assert len(siglist) == 3 + for ss in siglist: + assert 'Thermotoga' in ss.name + + +def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): + # check what happens with picklists after index + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + outdb = runtmp.output(f'gcf.lca.{lca_db_format}') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + # create an empty spreadsheet + with open(runtmp.output('empty.csv'), 'wt') as fp: + fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + + runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, + '-k', '21', '-F', lca_db_format) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + idx = sourmash.load_file_as_index(outdb) + picklist_obj = 
SignaturePicklist.from_picklist_args(f"{picklist}:md5:md5:exclude") + picklist_obj.load(picklist_obj.pickfile, picklist_obj.column_name) + idx = idx.select(picklist=picklist_obj) + + siglist = list(idx.signatures()) + assert len(siglist) == 9 + for ss in siglist: + assert 'Thermotoga' not in ss.name + + def test_lca_jaccard_ordering(): # this tests a tricky situation where for three sketches A, B, C, # |A intersect B| is greater than |A intersect C| @@ -2730,3 +2880,21 @@ def _intersect(x, y): assert sr[0].score == 1.0 assert sr[1].signature == ss_c assert sr[1].score == 0.2 + + +def test_lca_db_protein_save_twice(runtmp, lca_db_format): + # test save twice + sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + + sig1 = sourmash.load_one_signature(sigfile1) + sig2 = sourmash.load_one_signature(sigfile2) + + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + assert db.insert(sig1) + assert db.insert(sig2) + + db.save(runtmp.output('xxx'), format=lca_db_format) + + with pytest.raises(ValueError): + db.save(runtmp.output('xxx'), format=lca_db_format) diff --git a/tests/test_lca_db_protocol.py b/tests/test_lca_db_protocol.py index daca0f2f62..a3fc57b085 100644 --- a/tests/test_lca_db_protocol.py +++ b/tests/test_lca_db_protocol.py @@ -38,7 +38,20 @@ def build_json_lca_db(runtmp): db = build_inmem_lca_db(runtmp) db_out = runtmp.output('protein.lca.json') - db.save(db_out) + db.save(db_out, format='json') + + x = load_single_database(db_out) + db_load = x[0] + + return db_load + + +def build_sql_lca_db(runtmp): + # test saved/loaded SQL database + db = build_inmem_lca_db(runtmp) + db_out = runtmp.output('protein.lca.json') + + db.save(db_out, format='sql') x = load_single_database(db_out) db_load = x[0] @@ -47,7 +60,8 @@ def build_json_lca_db(runtmp): @pytest.fixture(params=[build_inmem_lca_db, - 
build_json_lca_db]) + build_json_lca_db, + build_sql_lca_db]) def lca_db_obj(request, runtmp): build_fn = request.param @@ -106,3 +120,18 @@ def test_get_identifiers_for_hashval_2(lca_db_obj): assert 'GCA_001593925' in all_idents assert 'GCA_001593935' in all_idents + + +def test_downsample_scaled(lca_db_obj): + # check the downsample_scaled method + assert lca_db_obj.scaled == 100 + lca_db_obj.downsample_scaled(500) + assert lca_db_obj.scaled == 500 + + +def test_downsample_scaled_fail(lca_db_obj): + # check the downsample_scaled method - should fail if lower scaled. + assert lca_db_obj.scaled == 100 + + with pytest.raises(ValueError): + lca_db_obj.downsample_scaled(50) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index b34cbe4dc4..3f63d4e88f 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -80,6 +80,35 @@ def test_manifest_to_picklist(): assert len(new_manifest) == len(manifest) +def test_manifest_compare(): + # test saving and loading manifests + protzip = utils.get_test_data('prot/protein.zip') + + loader = sourmash.load_file_as_index(protzip) + manifest = loader.manifest + + # equal + rows = list(manifest.rows) + + equal_mf = index.CollectionManifest(rows) + assert equal_mf == manifest + + # not equal / shorter + rows = list(manifest.rows) + rows = rows[:-1] + + short_mf = index.CollectionManifest(rows) + assert short_mf != manifest + + # not equal / diff values + rows = list(manifest.rows) + rows[0] = dict(rows[0]) + rows[0]['internal_location'] += '.foo' + + short_mf = index.CollectionManifest(rows) + assert short_mf != manifest + + def test_save_load_manifest(): # test saving and loading manifests protzip = utils.get_test_data('prot/protein.zip') @@ -115,3 +144,21 @@ def test_save_load_manifest(): # manifest 2 in manifest? for row in manifest2.rows: assert pick1.matches_manifest_row(row) + + # equal? 
+ assert manifest == manifest2 + + # not equal / shorter + rows = list(manifest.rows) + rows = rows[1:] + + short_mf = index.CollectionManifest(rows) + assert short_mf != manifest + + # not equal / diff values + rows = list(manifest.rows) + rows[0] = dict(rows[0]) + rows[0]['internal_location'] += '.foo' + + short_mf = index.CollectionManifest(rows) + assert short_mf != manifest diff --git a/tests/test_manifest_protocol.py b/tests/test_manifest_protocol.py index bbfa7691a0..3f8abeeb65 100644 --- a/tests/test_manifest_protocol.py +++ b/tests/test_manifest_protocol.py @@ -8,6 +8,7 @@ import sourmash from sourmash.manifest import BaseCollectionManifest, CollectionManifest +from sourmash.index.sqlite_index import SqliteCollectionManifest def build_simple_manifest(runtmp): @@ -19,6 +20,17 @@ def build_simple_manifest(runtmp): return mf +def build_sqlite_manifest(runtmp): + # return the manifest from prot/all.zip + filename = utils.get_test_data('prot/all.zip') + idx = sourmash.load_file_as_index(filename) + mf = idx.manifest + + # build sqlite manifest from this 'un + mfdb = runtmp.output('test.sqlmf') + return SqliteCollectionManifest.load_from_manifest(mf, dbfile=mfdb) + + def save_load_manifest(runtmp): # save/load the manifest from a CSV. 
mf = build_simple_manifest(runtmp) @@ -31,7 +43,8 @@ def save_load_manifest(runtmp): @pytest.fixture(params=[build_simple_manifest, - save_load_manifest]) + save_load_manifest, + build_sqlite_manifest]) def manifest_obj(request, runtmp): build_fn = request.param diff --git a/tests/test_sqlite_index.py b/tests/test_sqlite_index.py new file mode 100644 index 0000000000..ea64137aae --- /dev/null +++ b/tests/test_sqlite_index.py @@ -0,0 +1,865 @@ +"Tests for SqliteIndex, SqliteCollectionManifest, and LCA_SqliteDatabase" +import os +import pytest +import shutil +import sqlite3 + +import sourmash +from sourmash.exceptions import IndexNotSupported +from sourmash.index.sqlite_index import (SqliteIndex, load_sqlite_index, + SqliteCollectionManifest, + LCA_SqliteDatabase) + +from sourmash.index import StandaloneManifestIndex +from sourmash import load_one_signature, SourmashSignature +from sourmash.picklist import SignaturePicklist, PickStyle +from sourmash.manifest import CollectionManifest +from sourmash.tax.tax_utils import MultiLineageDB + +import sourmash_tst_utils as utils +from sourmash_tst_utils import SourmashCommandFailed +from sourmash import sqlite_utils + + +def test_sqlite_index_prefetch_empty(): + # check that an exception is raised upon for an empty database + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + + sqlidx = SqliteIndex.create(":memory:") + + # since this is a generator, we need to actually ask for a value to + # get exception raised. 
+ g = sqlidx.prefetch(ss2, threshold_bp=0) + with pytest.raises(ValueError) as e: + next(g) + + assert "no signatures to search" in str(e.value) + + +def test_sqlite_index_bad_version(runtmp): + # create a sqlite database with a bad index version in the + # sourmash_internal table, see what happens :) + + dbfile = runtmp.output('xyz.sqldb') + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + SqliteIndex._create_tables(c) + + # 0.9 doesn't exist/is bad version + c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', + ('0.9', 'SqliteIndex')) + + conn.commit() + + with pytest.raises(IndexNotSupported): + idx = sourmash.load_file_as_index(dbfile) + + +def test_sqlite_index_bad_version_unique(runtmp): + # try to insert duplicate sqlite index info into sourmash_internal; fail + + dbfile = runtmp.output('xyz.sqldb') + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + SqliteIndex._create_tables(c) + + # can't insert duplicate key + with pytest.raises(sqlite3.IntegrityError): + c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', + ('1.1', 'SqliteIndex')) + + +def test_index_search_subj_scaled_is_lower(): + # check that subject sketches are appropriately downsampled + sigfile = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') + ss = sourmash.load_one_signature(sigfile) + + # double check :) + assert ss.minhash.scaled == 100 + + # build a new query that has a scaled of 1000 + qs = SourmashSignature(ss.minhash.downsample(scaled=1000)) + + # create Index to search + sqlidx = SqliteIndex.create(":memory:") + sqlidx.insert(ss) + + # search! 
+ results = list(sqlidx.search(qs, threshold=0)) + assert len(results) == 1 + # original signature (not downsampled) is returned + assert results[0].signature == ss + + +def test_sqlite_index_save_load(runtmp): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + filename = runtmp.output('foo') + sqlidx = SqliteIndex.create(filename) + sqlidx.insert(ss2) + sqlidx.insert(ss47) + sqlidx.insert(ss63) + + sqlidx.close() + + sqlidx2 = SqliteIndex.load(filename) + + # now, search for sig2 + sr = sqlidx2.search(ss2, threshold=1.0) + print([s[1].name for s in sr]) + assert len(sr) == 1 + assert sr[0][1] == ss2 + + +def test_sqlite_index_multik_select(): + # this loads three ksizes, 21/31/51 + sig2 = utils.get_test_data('2.fa.sig') + siglist = sourmash.load_file_as_signatures(sig2) + + sqlidx = SqliteIndex.create(":memory:") + for ss in siglist: + sqlidx.insert(ss) + + # select most specifically + sqlidx2 = sqlidx.select(ksize=31, moltype='DNA') + assert len(sqlidx2) == 1 + + # all are DNA: + sqlidx2 = sqlidx.select(moltype='DNA') + assert len(sqlidx2) == 3 + + +def test_sqlite_index_num_select(): + # this will fail on 'num' select, which is not allowed + sqlidx = SqliteIndex.create(":memory:") + with pytest.raises(ValueError): + sqlidx.select(num=100) + + +def test_sqlite_index_abund_select(): + # this will fail on 'track_abundance' select, which is not allowed + sqlidx = SqliteIndex.create(":memory:") + with pytest.raises(ValueError): + sqlidx.select(track_abundance=True) + + +def test_sqlite_index_insert_num_fail(): + # cannot insert 'num' signatures + sqlidx = SqliteIndex.create(":memory:") + + sig47 = utils.get_test_data('num/47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + assert ss47.minhash.num != 0 + + with 
pytest.raises(ValueError) as exc: + sqlidx.insert(ss47) + + assert "cannot store 'num' signatures in SqliteIndex" in str(exc) + + +def test_sqlite_index_insert_abund_fail(): + # cannot insert signatures with abundance + sqlidx = SqliteIndex.create(":memory:") + + sig47 = utils.get_test_data('track_abund/47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + with pytest.raises(ValueError) as exc: + sqlidx.insert(ss47) + + assert "cannot store signatures with abundance in SqliteIndex" in str(exc) + + +def test_sqlite_index_moltype_multi_fail(): + # check that we cannot store sigs with multiple scaled values. + + # this loads multiple ksizes (19, 31) and moltypes (DNA, protein, hp, etc) + filename = utils.get_test_data('prot/all.zip') + siglist = sourmash.load_file_as_signatures(filename) + siglist = list(siglist) + + sqlidx = SqliteIndex.create(":memory:") + + sqlidx.insert(siglist[0]) + assert sqlidx.scaled == 100 + + with pytest.raises(ValueError) as exc: + for ss in siglist: + sqlidx.insert(ss) + + assert "this database can only store scaled values=100" in str(exc) + + +def test_sqlite_index_picklist_select(): + # test select with a picklist + + # this loads three ksizes, 21/31/51 + sig2 = utils.get_test_data('2.fa.sig') + siglist = sourmash.load_file_as_signatures(sig2) + + sqlidx = SqliteIndex.create(":memory:") + for ss in siglist: + sqlidx.insert(ss) + + # construct a picklist... 
+ picklist = SignaturePicklist('md5prefix8') + picklist.init(['f3a90d4e']) + + # select on picklist + sqlidx2 = sqlidx.select(picklist=picklist) + assert len(sqlidx2) == 1 + ss = list(sqlidx2.signatures())[0] + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('f3a90d4e55') + + +def test_sqlite_index_picklist_select_exclude(): + # test select with a picklist, but exclude + + # this loads three ksizes, 21/31/51 + sig2 = utils.get_test_data('2.fa.sig') + siglist = sourmash.load_file_as_signatures(sig2) + + sqlidx = SqliteIndex.create(":memory:") + for ss in siglist: + sqlidx.insert(ss) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) + picklist.init(['f3a90d4e']) + + # select on picklist + sqlidx2 = sqlidx.select(picklist=picklist) + assert len(sqlidx2) == 2 + md5s = set() + ksizes = set() + for ss in list(sqlidx2.signatures()): + md5s.add(ss.md5sum()) + ksizes.add(ss.minhash.ksize) + assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0']) + assert ksizes == set([21,51]) + + +def test_sqlite_jaccard_ordering(): + # this tests a tricky situation where for three sketches A, B, C, + # |A intersect B| is greater than |A intersect C| + # _but_ + # |A jaccard B| is less than |A jaccard C| + a = sourmash.MinHash(ksize=31, n=0, scaled=2) + b = a.copy_and_clear() + c = a.copy_and_clear() + + a.add_many([1, 2, 3, 4]) + b.add_many([1, 2, 3] + list(range(10, 30))) + c.add_many([1, 5]) + + def _intersect(x, y): + return x.intersection_and_union_size(y)[0] + + print('a intersect b:', _intersect(a, b)) + print('a intersect c:', _intersect(a, c)) + print('a jaccard b:', a.jaccard(b)) + print('a jaccard c:', a.jaccard(c)) + assert _intersect(a, b) > _intersect(a, c) + assert a.jaccard(b) < a.jaccard(c) + + # thresholds to use: + assert a.jaccard(b) < 0.15 + assert a.jaccard(c) > 0.15 + + # now - make signatures, try out :) + ss_a = sourmash.SourmashSignature(a, name='A') + ss_b = 
sourmash.SourmashSignature(b, name='B') + ss_c = sourmash.SourmashSignature(c, name='C') + + sqlidx = SqliteIndex.create(":memory:") + sqlidx.insert(ss_a) + sqlidx.insert(ss_b) + sqlidx.insert(ss_c) + + sr = sqlidx.search(ss_a, threshold=0.15) + print(sr) + assert len(sr) == 2 + assert sr[0].signature == ss_a + assert sr[0].score == 1.0 + assert sr[1].signature == ss_c + assert sr[1].score == 0.2 + + +def test_sqlite_index_scaled1(): + # check on scaled=1 storage. + sqlidx = SqliteIndex.create(":memory:") + + mh1 = sourmash.MinHash(0, 31, scaled=1) + mh1.add_hash(2**64 - 1) + mh1.add_hash(2**64 - 2) + mh1.add_hash(2**64 - 3) + ss1 = sourmash.SourmashSignature(mh1, name='ss 1') + + mh2 = sourmash.MinHash(0, 31, scaled=1) + mh2.add_hash(2**64 - 1) + mh2.add_hash(2**64 - 2) + mh2.add_hash(2**64 - 3) + mh2.add_hash(0) + mh2.add_hash(1) + mh2.add_hash(2) + ss2 = sourmash.SourmashSignature(mh2, name='ss 2') + + sqlidx.insert(ss1) + sqlidx.insert(ss2) + + # check jaccard search + results = list(sqlidx.search(ss1, threshold=0)) + print(results) + assert len(results) == 2 + assert results[0].signature == ss1 + assert results[0].score == 1.0 + assert results[1].signature == ss2 + assert results[1].score == 0.5 + + results = list(sqlidx.search(ss1, threshold=0, do_containment=True)) + print(results) + assert results[0].signature == ss1 + assert results[0].score == 1.0 + assert results[1].signature == ss2 + assert results[1].score == 1.0 + + # minhashes retrieved successfully? 
+ assert len(results[0].signature.minhash) == 3 + assert len(results[1].signature.minhash) == 6 + + +def test_sqlite_index_load_existing(): + # try loading an existing sqlite index + filename = utils.get_test_data('sqlite/index.sqldb') + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, SqliteIndex) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 + + +def test_sqlite_index_create_load_existing(runtmp): + # try creating then loading an existing sqlite index; create from CLI + filename = runtmp.output('idx.sqldb') + sig1 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data('63.fa.sig') + + runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, SqliteIndex) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 + + +def test_sqlite_index_create_load_insert_existing(runtmp): + # try creating, loading, inserting into an existing sqlite index + filename = runtmp.output('idx.sqldb') + sig1 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data('63.fa.sig') + sig3 = utils.get_test_data('2.fa.sig') + + runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, SqliteIndex) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 + + ss3 = sourmash.load_one_signature(sig3, ksize=31) + sqlidx.insert(ss3) + sqlidx.commit() + + runtmp.sourmash('sig', 'describe', filename) + print(runtmp.last_result.out) + assert "md5: f3a90d4e5528864a5bcc8434b0d0c3b1" in runtmp.last_result.out + + +def test_sqlite_index_create_load_insert_existing_cli(runtmp): + # try creating, loading, inserting into an existing sqlite index from cli + # (aka "append" to existing database) + filename = runtmp.output('idx.sqldb') + sig1 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data('63.fa.sig') + sig3 = utils.get_test_data('2.fa.sig') + + 
runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, SqliteIndex) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 + + # add a third + runtmp.sourmash('sig', 'cat', sig3, '-o', filename, '-k', '31') + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 3 + + +def test_sqlite_manifest_bad_version(runtmp): + # create a sqlite database with a bad manifest version in the + # sourmash_internal table, see what happens :) + + dbfile = runtmp.output('xyz.sqlmf') + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + SqliteCollectionManifest._create_tables(c) + + # 0.9 doesn't exist/bad version + c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', + ('0.9', 'SqliteManifest')) + + conn.commit() + + with pytest.raises(IndexNotSupported): + mf = CollectionManifest.load_from_filename(dbfile) + + +def test_sqlite_manifest_bad_version_unique(runtmp): + # try to insert duplicate sqlite manifest info into sourmash_internal; fail + + dbfile = runtmp.output('xyz.sqldb') + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + SqliteCollectionManifest._create_tables(c) + + # can't insert duplicate key + with pytest.raises(sqlite3.IntegrityError): + c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', + ('1.1', 'SqliteManifest')) + + +def test_sqlite_manifest_basic(): + # test some features of the SQLite-based manifest. + sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) + sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + + sqlidx = SqliteIndex.create(":memory:") + + # empty manifest tests + manifest = sqlidx.manifest + assert not manifest + assert len(manifest) == 0 + + sqlidx.insert(sig47) + sqlidx.insert(sig63) + + # ok, more full manifest tests! 
+ assert manifest + assert len(manifest) == 2 + + assert sig47 in manifest + assert sig2 not in manifest + + # check that we can get a "standard" manifest out + standard_mf = CollectionManifest.load_from_manifest(sqlidx.manifest) + assert len(standard_mf) == 2 + + picklist = manifest.to_picklist() + assert sig47 in picklist + assert sig2 not in picklist + + +def test_sqlite_manifest_round_trip(): + # check that we can go from regular mf -> sqlite mf -> regular again. + sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) + sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + + rows = [] + rows.append(CollectionManifest.make_manifest_row(sig47, None, + include_signature=False)) + rows.append(CollectionManifest.make_manifest_row(sig63, None, + include_signature=False)) + nosql_mf = CollectionManifest(rows) + + sqlite_mf = SqliteCollectionManifest.load_from_manifest(nosql_mf) + + # test roundtrip + round_mf = CollectionManifest.load_from_manifest(sqlite_mf) + + assert len(round_mf) == 2 + print(round_mf.rows, nosql_mf.rows) + assert round_mf == nosql_mf + + for mf in (nosql_mf, sqlite_mf, round_mf): + picklist = mf.to_picklist() + assert sig47 in picklist + assert sig2 not in picklist + + +def test_sqlite_manifest_create(runtmp): + # test creation and summarization of a manifest of prot.zip + zipfile = utils.get_test_data('prot/all.zip') + + # create manifest + runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, + '-o', 'mf.sqlmf') + + sqlmf = runtmp.output('mf.sqlmf') + assert os.path.exists(sqlmf) + + # verify it's loadable as the right type + idx = load_sqlite_index(sqlmf) + assert isinstance(idx, StandaloneManifestIndex) + + # summarize + runtmp.sourmash('sig', 'fileinfo', 'mf.sqlmf') + + out = runtmp.last_result.out + print(out) + + assert "2 sketches with dayhoff, k=19, scaled=100 7945 total hashes" in out + assert "2 sketches with hp, k=19, 
scaled=100 5184 total hashes" in out + assert "2 sketches with protein, k=19, scaled=100 8214 total hashes" in out + assert "1 sketches with DNA, k=31, scaled=1000 5238 total hashes" in out + + assert "path filetype: StandaloneManifestIndex" in out + assert "location: mf.sqlmf" in out + assert "is database? yes" in out + assert "has manifest? yes" in out + assert "num signatures: 7" in out + + +def test_sqlite_manifest_create_noload_sigs(runtmp): + # sigs should not be loadable from manifest this way... + zipfile = utils.get_test_data('prot/all.zip') + + # create manifest + runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, + '-o', 'mf.sqlmf') + + # 'describe' should not be able to load the sqlmf b/c prefix is wrong + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'describe', 'mf.sqlmf') + + +def test_sqlite_manifest_create_yesload_sigs(runtmp): + # should be able to load after copying files + zipfile = utils.get_test_data('prot/all.zip') + shutil.copytree(utils.get_test_data('prot'), runtmp.output('prot')) + + # create manifest + runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, + '-o', 'prot/mf.sqlmf') + + # 'describe' should now be able to load the sqlmf, which is cool + runtmp.sourmash('sig', 'describe', 'prot/mf.sqlmf') + print(runtmp.last_result.out) + + +def test_sqlite_manifest_num(runtmp): + # should be able to produce sql manifests with 'num' sketches in them + numsig = utils.get_test_data('num/47.fa.sig') + + # create mf + runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, + '-o', 'mf.sqlmf') + + # do summarize: + runtmp.sourmash('sig', 'summarize', 'mf.sqlmf') + out = runtmp.last_result.out + + print(out) + + assert "1 sketches with DNA, k=21, num=500 500 total hashes" in out + assert "1 sketches with DNA, k=31, num=500 500 total hashes" in out + assert "1 sketches with DNA, k=51, num=500 500 total hashes" in out + + +def test_sqlite_manifest_num_select(runtmp): + # should be able to _select_ sql manifests with 
'num' sketches in them + numsig = utils.get_test_data('num/47.fa.sig') + + # create mf + runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, + '-o', 'mf.sqlmf') + + # load as index + idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + + # select + print(list(idx.manifest.rows)) + idx = idx.select(num=500) + print(list(idx.manifest.rows)) + assert len(idx) == 3 + + +def test_sqlite_manifest_locations(runtmp): + # check what locations returns... may return too many, that's ok. + prot = utils.get_test_data('prot') + + runtmp.sourmash('sig', 'manifest', '-F', 'sql', prot, + '-o', 'mf.sqlmf') + + # load as index + idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + + picklist = SignaturePicklist('identprefix') + picklist.pickset = set(['GCA_001593925']) + idx = idx.select(picklist=picklist) + + sql_locations = set(idx.manifest.locations()) + row_locations = set(row['internal_location'] for row in idx.manifest.rows) + + assert sql_locations.issuperset(row_locations) + + assert 'dna-sig.sig.gz' in sql_locations # this is unnecessary... + assert 'dna-sig.sig.gz' not in row_locations # ...this is correct :) + + +def test_sqlite_manifest_create_insert(runtmp): + # try out creating a sqlite manifest and then running cli on it + + mfname = runtmp.output("some.sqlmf") + mf = SqliteCollectionManifest.create(mfname) + + sigfile = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sigfile) + + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf.conn.commit() + + # copy sig in since we want it to resolve... + shutil.copyfile(sigfile, runtmp.output('some.sig')) + + # 'describe' should work here, to resolve actual sigs. 
+ runtmp.sourmash('sig', 'describe', mfname) + print(runtmp.last_result.out) + assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + + +def test_sqlite_manifest_create_insert_2(runtmp): + # try out creating a sqlite manifest from cli and then _insert_row into it + + # copy sig in since we want it to resolve... + sigfile = utils.get_test_data('47.fa.sig') + shutil.copyfile(sigfile, runtmp.output('some.sig')) + + runtmp.sourmash('sig', 'manifest', 'some.sig', '-F', 'sql', + '-o', 'some.sqlmf') + mfname = runtmp.output("some.sqlmf") + + mf = CollectionManifest.load_from_filename(mfname) + ss = sourmash.load_one_signature(runtmp.output('some.sig')) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf.conn.commit() + + # 'describe' should work here, to resolve actual sigs. + runtmp.sourmash('sig', 'describe', mfname) + print(runtmp.last_result.out) + assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + + +def test_sqlite_manifest_existing(runtmp): + # try out an existing sqlite manifest + + prefix = runtmp.output('protdir') + mf = runtmp.output('protdir/prot.sqlmf') + shutil.copytree(utils.get_test_data('prot'), prefix) + shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + + runtmp.sourmash('sig', 'describe', mf) + print(runtmp.last_result.out) + + +def test_sqlite_manifest_existing_insert(runtmp): + # try out an existing sqlite manifest - insert into it + + prefix = runtmp.output('protdir') + shutil.copytree(utils.get_test_data('prot'), prefix) + + mfname = runtmp.output('protdir/prot.sqlmf') + shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mf = CollectionManifest.load_from_filename(mfname) + assert isinstance(mf, SqliteCollectionManifest) + + sigfile = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sigfile) + + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf.conn.commit() + + # copy sig in since we want it to resolve... 
+ shutil.copyfile(sigfile, runtmp.output('protdir/some.sig')) + + # 'describe' should work here. + runtmp.sourmash('sig', 'describe', mfname) + print(runtmp.last_result.out) + + +def test_sqlite_manifest_existing_mf_only(runtmp): + # try out an existing sqlite manifest, but without underlying files -> fail + + mf = runtmp.output('prot.sqlmf') + shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + + # 'fileinfo' should work... + runtmp.sourmash('sig', 'fileinfo', mf) + print(runtmp.last_result.out) + assert 'num signatures: 7' in runtmp.last_result.out + + # ...but 'describe' should fail, since it needs actual sigs. + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.sourmash('sig', 'describe', mf) + + print(runtmp.last_result.err) + assert 'ERROR: Error while reading signatures from' in runtmp.last_result.err + + +def test_sqlite_manifest_existing_mfonly_insert(runtmp): + # try out an existing sqlite manifest - insert into it, but fail describe + + mfname = runtmp.output('prot.sqlmf') + shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mf = CollectionManifest.load_from_filename(mfname) + assert isinstance(mf, SqliteCollectionManifest) + + sigfile = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sigfile) + + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, sigfile)) + mf.conn.commit() + + # 'fileinfo' should work... + runtmp.sourmash('sig', 'fileinfo', mfname) + print(runtmp.last_result.out) + assert 'num signatures: 8' in runtmp.last_result.out + + # ...but 'describe' should fail, since it needs actual sigs. 
+ with pytest.raises(SourmashCommandFailed) as exc: + runtmp.sourmash('sig', 'describe', mfname) + + +def test_sqlite_manifest_load_existing_index(): + # try loading an existing sqlite index as a manifest + filename = utils.get_test_data('sqlite/index.sqldb') + mf = CollectionManifest.load_from_filename(filename) + assert isinstance(mf, SqliteCollectionManifest) + + assert len(mf) == 2 + + +def test_sqlite_manifest_load_existing_index_insert_fail(): + # try loading an existing sqlite index as a manifest; insert should fail + filename = utils.get_test_data('sqlite/index.sqldb') + mf = CollectionManifest.load_from_filename(filename) + assert isinstance(mf, SqliteCollectionManifest) + + assert len(mf) == 2 + + # try insert - should fail + sigfile = utils.get_test_data('47.fa.sig') + ss = sourmash.load_one_signature(sigfile) + + with pytest.raises(Exception) as exc: + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, sigfile)) + + assert "must use SqliteIndex.insert to add to this manifest" in str(exc) + + +def test_sqlite_lca_db_load_existing(): + # try loading an existing sqlite index + filename = utils.get_test_data('sqlite/lca.sqldb') + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, LCA_SqliteDatabase) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 + + +def test_sqlite_lca_db_create_load_existing(runtmp): + # try creating (from CLI) then loading (from API) an LCA db + filename = runtmp.output('lca.sqldb') + sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') + + runtmp.sourmash('sig', 'flatten', sig1, sig2, '-o', filename, '-k', '31') + + # load tax + tax_csv = utils.get_test_data('sqlite/delmont-6.csv') + runtmp.sourmash('tax', 'prepare', '-t', tax_csv, + '-o', filename, '-F', 'sql') + + sqlidx = sourmash.load_file_as_index(filename) + assert isinstance(sqlidx, LCA_SqliteDatabase) + + siglist = list(sqlidx.signatures()) + assert len(siglist) == 2 
+ + +def test_sqlite_lca_db_load_empty(runtmp): + # try creating then loading an _empty_ LCA_SqliteDatabase + + dbname = runtmp.output('empty.sqldb') + + # create empty SqliteIndex... + runtmp.sourmash('sig', 'cat', '-o', dbname) + assert os.path.exists(dbname) + + # ...and create empty sourmash_taxonomy tables in there... + empty_tax = utils.get_test_data('scaled/empty-lineage.csv') + runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, + '-o', dbname) + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('sig', 'describe', dbname) + + +def test_sqlite_lca_db_try_load_sqlite_index(): + # try loading a SqliteIndex with no tax tables from .load classmethod + dbname = utils.get_test_data('sqlite/index.sqldb') + + with pytest.raises(ValueError) as exc: + db = LCA_SqliteDatabase.load(dbname) + + assert "not a taxonomy database" in str(exc) + + +def test_sqlite_lca_db_supply_lineage_db(): + # try creating an LCA_SqliteDatabase object with a separate lineage DB. + dbname = utils.get_test_data('sqlite/index.sqldb') + + tax_csv = utils.get_test_data('sqlite/shewanella-lineage.csv') + lineage_db = MultiLineageDB.load([tax_csv]) + + db = LCA_SqliteDatabase(dbname, lineage_db=lineage_db) + + hashval = next(iter(db.hashvals)) + lineages = db.get_lineage_assignments(hashval) + print(lineages) + assert lineages[0][0].rank == 'superkingdom' + assert lineages[0][0].name == 'd__Bacteria' + assert lineages[0][-1].rank == 'species' + assert lineages[0][-1].name == 's__Shewanella baltica' + assert lineages[1][0].rank == 'superkingdom' + assert lineages[1][0].name == 'd__Bacteria' + assert lineages[0][-1].rank == 'species' + assert lineages[0][-1].name == 's__Shewanella baltica' + + +def test_bad_sqlite_internal_version(): + # check get_sourmash_internal + dbname = utils.get_test_data('sqlite/index.sqldb') + + conn = sqlite_utils.open_sqlite_db(dbname) + c = conn.cursor() + with pytest.raises(Exception): + sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', 
'0.9') diff --git a/tests/test_tax.py b/tests/test_tax.py index 93702847bb..1faf6ce19a 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -9,6 +9,9 @@ from sourmash.tax import tax_utils from sourmash_tst_utils import SourmashCommandFailed +from sourmash import sqlite_utils +from sourmash.exceptions import IndexNotSupported + ## command line tests def test_run_sourmash_tax(): status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) @@ -1819,3 +1822,31 @@ def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp): keep_identifier_versions=False) assert set(db1) == set(db2) assert set(db1) == set(db3) + + +def test_tax_prepare_sqlite_lineage_version(runtmp): + # test bad sourmash_internals version for SqliteLineage + taxcsv = utils.get_test_data('tax/test.taxonomy.csv') + taxout = runtmp.output('out.db') + + runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv, + '-o', taxout, '-F', 'sql') + assert os.path.exists(taxout) + + # set bad version + conn = sqlite_utils.open_sqlite_db(taxout) + c = conn.cursor() + c.execute("UPDATE sourmash_internal SET value='0.9' WHERE key='SqliteLineage'") + + conn.commit() + conn.close() + + with pytest.raises(IndexNotSupported): + db = tax_utils.MultiLineageDB.load([taxout]) + +def test_tax_prepare_sqlite_no_lineage(): + # no lineage table at all + sqldb = utils.get_test_data('sqlite/index.sqldb') + + with pytest.raises(ValueError): + db = tax_utils.MultiLineageDB.load([sqldb]) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index dce1d6d9c2..449e26f972 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -967,6 +967,17 @@ def test_tax_multi_load_files(runtmp): MultiLineageDB.load([runtmp.output('no-such-file')]) +def test_tax_sql_load_new_file(runtmp): + # test loading a newer-format sql file with sourmash_internals table + taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db') + + db = MultiLineageDB.load([taxonomy_db]) + print(list(db.keys())) + assert len(db) == 6 + assert 
'strain' not in db.available_ranks + assert db['GCF_001881345'][0].rank == 'superkingdom' + + def test_tax_multi_load_files_shadowed(runtmp): # test loading various good and bad files taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')