sourmash-bio · ctb · Jun 12, 2020 · Jun 5, 2020 · Jun 6, 2020 · Jun 6, 2020
diff --git a/sourmash/cli/lca/index.py b/sourmash/cli/lca/index.py
@@ -1,6 +1,6 @@
 """create LCA database"""
 
-from sourmash.cli.utils import add_ksize_arg
+from sourmash.cli.utils import add_ksize_arg, add_moltype_args
 
 
 def subparser(subparsers):
@@ -15,6 +15,7 @@ def subparser(subparsers):
         '--scaled', metavar='S', default=10000, type=float
     )
     add_ksize_arg(subparser, 31)
+    add_moltype_args(subparser)
     subparser.add_argument(
         '-q', '--quiet', action='store_true',
         help='suppress non-error output'

diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py
@@ -148,6 +148,11 @@ def index(args):
     if args.ksize is None:
         args.ksize = DEFAULT_LOAD_K
 
+    moltype = sourmash_args.calculate_moltype(args, default='DNA')
+
+    notify('Building LCA database with ksize={} scaled={} moltype={}.',
+           args.ksize, args.scaled, moltype)
+
     # first, load taxonomy spreadsheet
     delimiter = ','
     if args.tabs:
@@ -163,7 +168,7 @@ def index(args):
     notify('{} distinct lineages in spreadsheet out of {} rows.',
            len(set(assignments.values())), num_rows)
 
-    db = LCA_Database(args.ksize, args.scaled)
+    db = LCA_Database(args.ksize, args.scaled, moltype)
 
 #    notify('finding signatures...')
     if args.traverse_directory:
@@ -192,9 +197,10 @@ def index(args):
     n_skipped = 0
     for filename in inp_files:
         n += 1
-        for sig in load_signatures(filename, ksize=args.ksize):
+        for sig in load_signatures(filename, ksize=args.ksize,
+                                   select_moltype=moltype):
             notify(u'\r\033[K', end=u'')
-            notify('\r... loading signature {} (file {} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
+            notify('\r... loading signature {} ({} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
             debug(filename, sig.name())
 
             # block off duplicates.
@@ -208,7 +214,10 @@ def index(args):
             # parse identifier, potentially with splitting
             ident = sig.name()
             if args.split_identifiers: # hack for NCBI-style names, etc.
-                ident = ident.split(' ')[0].split('.')[0]
+                # split on space...
+                ident = ident.split(' ')[0]
+                # ...and on period.
+                ident = ident.split('.')[0]
 
             lineage = assignments.get(ident)
 
@@ -221,26 +230,25 @@ def index(args):
             # add the signature into the database.
             db.insert(sig, ident=ident, lineage=lineage)
 
-            # remove from our list of remaining lineages
-            try:
+            if lineage:
+                # remove from our list of remaining ident -> lineage
                 record_remnants.remove(ident)
-            except KeyError:
-                # @CTB
-                pass
 
-            # track ident as used
-            record_used_idents.add(ident)
+                # track ident as used
+                record_used_idents.add(ident)
+                record_used_lineages.add(lineage)
 
             # track lineage info - either no lineage, or this lineage used.
-            if lineage is None:
+            else:
                 debug('WARNING: no lineage assignment for {}.', ident)
                 record_no_lineage.add(ident)
-            else:
-                record_used_lineages.add(lineage)
 
     # end main add signatures loop
 
-    notify(u'\r\033[K', end=u'')
+    if n_skipped:
+        notify('... loaded {} signatures; skipped {} because of --require-taxonomy.', total_n, n_skipped)
+    else:
+        notify('... loaded {} signatures.', total_n)
 
     # check -- did we find any signatures?
     if n == 0:
@@ -253,6 +261,8 @@ def index(args):
     if not db.hashval_to_idx:
         error('ERROR: no hash values found - are there any signatures?')
         sys.exit(1)
+    notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
+           args.ksize, args.scaled)
 
     # summarize:
     notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
@@ -261,6 +271,8 @@ def index(args):
 
     notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
            len(record_used_idents), len(set(assignments)))
+
+    assert record_used_idents.issubset(set(assignments))
     unused_identifiers = set(assignments) - record_used_idents
 
     # now, save!
@@ -282,7 +294,7 @@ def index(args):
             notify('WARNING: no lineage provided for {} signatures.',
                    len(record_no_lineage))
         if record_remnants:
-            notify('WARNING: no signatures for {} lineage assignments.',
+            notify('WARNING: no signatures for {} spreadsheet rows.',
                    len(record_remnants))
         if unused_lineages:
             notify('WARNING: {} unused lineages.', len(unused_lineages))

diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
@@ -56,10 +56,11 @@ class LCA_Database(Index):
     `hashval_to_idx` is a dictionary from individual hash values to sets of
     `idx`.
     """
-    def __init__(self, ksize, scaled):
+    def __init__(self, ksize, scaled, moltype='DNA'):
         self.ksize = int(ksize)
         self.scaled = int(scaled)
         self.filename = None
+        self.moltype = moltype
 
         self._next_index = 0
         self._next_lid = 0
@@ -119,6 +120,9 @@ def insert(self, sig, ident=None, lineage=None):
         if minhash.ksize != self.ksize:
             raise ValueError("cannot insert signature with ksize {} into DB (ksize {})".format(minhash.ksize, self.ksize))
 
+        if minhash.moltype != self.moltype:
+            raise ValueError("cannot insert signature with moltype {} into DB (moltype {})".format(minhash.moltype, self.moltype))
+
         # downsample to specified scaled; this has the side effect of
         # making sure they're all at the same scaled value!
         minhash = minhash.downsample_scaled(self.scaled)
@@ -152,6 +156,8 @@ def insert(self, sig, ident=None, lineage=None):
         for hashval in minhash.get_mins():
             self.hashval_to_idx[hashval].add(idx)
 
+        return len(minhash)
+
     def __repr__(self):
         return "LCA_Database('{}')".format(self.filename)
 
@@ -166,7 +172,7 @@ def select(self, ksize=None, moltype=None):
         ok = True
         if ksize is not None and self.ksize != ksize:
             ok = False
-        if moltype is not None and moltype != 'DNA':
+        if moltype is not None and moltype != self.moltype:
             ok = False
 
         if ok:
@@ -204,13 +210,15 @@ def load(cls, db_name):
             if db_type != 'sourmash_lca':
                 raise ValueError("database file '{}' is not an LCA db.".format(db_name))
 
-            if version != '2.0' or 'lid_to_lineage' not in load_d:
+            version = float(version)
+            if version < 2.0 or 'lid_to_lineage' not in load_d:
                 raise ValueError("Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one.")
 
             ksize = int(load_d['ksize'])
             scaled = int(load_d['scaled'])
+            moltype = load_d.get('moltype', 'DNA')
 
-            db = cls(ksize, scaled)
+            db = cls(ksize, scaled, moltype)
 
             # convert lineage_dict to proper lineages (tuples of LineagePairs)
             lid_to_lineage_2 = load_d['lid_to_lineage']
@@ -258,11 +266,12 @@ def save(self, db_name):
         with xopen(db_name, 'wt') as fp:
             # use an OrderedDict to preserve output order
             save_d = OrderedDict()
-            save_d['version'] = '2.0'
+            save_d['version'] = '2.1'
             save_d['type'] = 'sourmash_lca'
             save_d['license'] = 'CC0'
             save_d['ksize'] = self.ksize
             save_d['scaled'] = self.scaled
+            save_d['moltype'] = self.moltype
 
             # convert lineage internals from tuples to dictionaries
             d = OrderedDict()
@@ -387,8 +396,17 @@ def _signatures(self):
         "Create a _signatures member dictionary that contains {idx: sigobj}."
         from sourmash import MinHash, SourmashSignature
 
-        # CTB: if we wanted to support protein/other minhashes, do it here.
-        minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled)
+        is_protein = False
+        is_hp = False
+        is_dayhoff = False
+        if self.moltype == 'protein':
+            is_protein = True
+        elif self.moltype == 'hp':
+            is_hp = True
+        elif self.moltype == 'dayhoff':
+            is_dayhoff = True
+        minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled,
+                          is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff)
 
         debug('creating signatures for LCA DB...')
         mhd = defaultdict(minhash.copy_and_clear)

diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
@@ -97,14 +97,7 @@ def describe(args):
     for (sig, signature_file) in siglist:
         mh = sig.minhash
         ksize = mh.ksize
-        moltype = 'DNA'
-        if mh.is_protein:
-            if mh.dayhoff:
-                moltype = 'dayhoff'
-            elif mh.hp:
-                moltype = 'hp'
-            else:
-                moltype = 'protein'
+        moltype = mh.moltype
         scaled = mh.scaled
         num = mh.num
         seed = mh.seed
@@ -169,9 +162,7 @@ def overlap(args):
     md5_2 = sig2.md5sum()
 
     ksize = sig1.minhash.ksize
-    moltype = 'DNA'
-    if sig1.minhash.is_protein:
-        moltype = 'protein'
+    moltype = sig1.minhash.moltype
 
     num = sig1.minhash.num
     size1 = len(sig1.minhash)

diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py
@@ -34,21 +34,25 @@ def get_moltype(sig, require=False):
 
 
 def calculate_moltype(args, default=None):
-    if args.protein:
-        if args.dna is True:
-            error('cannot specify both --dna/--rna and --protein!')
-            sys.exit(-1)
-        args.dna = False
-
     moltype = default
+
+    n = 0
     if args.dna:
         moltype = 'DNA'
-    elif args.dayhoff:
+        n += 1
+    if args.dayhoff:
         moltype = 'dayhoff'
-    elif args.hp:
+        n += 1
+    if args.hp:
         moltype = 'hp'
-    elif args.protein:
+        n += 1
+    if args.protein:
         moltype = 'protein'
+        n += 1
+
+    if n > 1:
+        error("cannot specify more than one of --dna/--rna/--protein/--hp/--dayhoff")
+        sys.exit(-1)
 
     return moltype
 

diff --git a/tests/test-data/prot/build.sh b/tests/test-data/prot/build.sh
@@ -0,0 +1,3 @@
+sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein  --no-dna --outdir protein --protein
+sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein  --no-dna --outdir hp --hp
+sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein  --no-dna --outdir dayhoff --dayhoff
diff --git a/tests/test-data/prot/dayhoff.lca.json.gz b/tests/test-data/prot/dayhoff.lca.json.gz
diff --git a/tests/test-data/prot/dayhoff.sbt.zip b/tests/test-data/prot/dayhoff.sbt.zip
diff --git a/tests/test-data/prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig b/tests/test-data/prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig
diff --git a/tests/test-data/prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig b/tests/test-data/prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig
diff --git a/tests/test-data/prot/gtdb-subset-lineages.csv b/tests/test-data/prot/gtdb-subset-lineages.csv
@@ -0,0 +1,3 @@
+accession,gtdb_id,superkingdom,phylum,class,order,family,genus,species
+GCA_001593935,d__Archaea,p__Crenarchaeota,c__Bathyarchaeia,o__B26-1,f__B26-1,g__B26-1,s__B26-1 sp001593935
+GCA_001593925,d__Archaea,p__Crenarchaeota,c__Bathyarchaeia,o__B26-1,f__B26-1,g__B26-1,s__B26-1 sp001593925
diff --git a/tests/test-data/prot/hp.lca.json.gz b/tests/test-data/prot/hp.lca.json.gz
diff --git a/tests/test-data/prot/hp.sbt.zip b/tests/test-data/prot/hp.sbt.zip
diff --git a/tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig b/tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig
diff --git a/tests/test-data/prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig b/tests/test-data/prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig
diff --git a/tests/test-data/prot/protein.lca.json.gz b/tests/test-data/prot/protein.lca.json.gz
diff --git a/tests/test-data/prot/protein.sbt.zip b/tests/test-data/prot/protein.sbt.zip
diff --git a/tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig b/tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig
diff --git a/tests/test-data/prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig b/tests/test-data/prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig
diff --git a/tests/test-data/protein_781.sig → tests/test-data/protein_bug_781.sig b/tests/test-data/protein_781.sig → tests/test-data/protein_bug_781.sig
diff --git a/tests/test_bugs.py b/tests/test_bugs.py
@@ -3,7 +3,7 @@
 
 def test_bug_781():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('protein_781.sig')
+        testdata1 = utils.get_test_data('protein_bug_781.sig')
 
         status, out, err = utils.runscript('sourmash',
                                            ['compare',

diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
@@ -786,6 +786,34 @@ def test_sig_describe_1(c):
     for line in expected_output:
         assert line.strip() in out
 
+
+@utils.in_thisdir
+def test_sig_describe_protein(c):
+    # test describe on a singleton protein signature
+    testdata = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+    c.run_sourmash('sig', 'describe', testdata)
+
+    assert 'k=57 molecule=protein num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out
+
+
+@utils.in_thisdir
+def test_sig_describe_hp(c):
+    # test describe on a singleton hp signature
+    testdata = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+    c.run_sourmash('sig', 'describe', testdata)
+
+    assert 'k=57 molecule=hp num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out
+
+
+@utils.in_thisdir
+def test_sig_describe_dayhoff(c):
+    # test describe on a singleton dayhoff signature
+    testdata = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+    c.run_sourmash('sig', 'describe', testdata)
+
+    assert 'k=57 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out
+
+
 @utils.in_tempdir
 def test_sig_describe_1_hp(c):
     # get basic info on a signature