Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Enable moltypes other than DNA in LCA databases #1013

Merged
merged 22 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sourmash/cli/lca/index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""create LCA database"""

from sourmash.cli.utils import add_ksize_arg
from sourmash.cli.utils import add_ksize_arg, add_moltype_args


def subparser(subparsers):
Expand All @@ -15,6 +15,7 @@ def subparser(subparsers):
'--scaled', metavar='S', default=10000, type=float
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
subparser.add_argument(
'-q', '--quiet', action='store_true',
help='suppress non-error output'
Expand Down
44 changes: 28 additions & 16 deletions sourmash/lca/command_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ def index(args):
if args.ksize is None:
args.ksize = DEFAULT_LOAD_K

moltype = sourmash_args.calculate_moltype(args, default='DNA')

notify('Building LCA database with ksize={} scaled={} moltype={}.',
args.ksize, args.scaled, moltype)

# first, load taxonomy spreadsheet
delimiter = ','
if args.tabs:
Expand All @@ -163,7 +168,7 @@ def index(args):
notify('{} distinct lineages in spreadsheet out of {} rows.',
len(set(assignments.values())), num_rows)

db = LCA_Database(args.ksize, args.scaled)
db = LCA_Database(args.ksize, args.scaled, moltype)

# notify('finding signatures...')
if args.traverse_directory:
Expand Down Expand Up @@ -192,9 +197,10 @@ def index(args):
n_skipped = 0
for filename in inp_files:
n += 1
for sig in load_signatures(filename, ksize=args.ksize):
for sig in load_signatures(filename, ksize=args.ksize,
select_moltype=moltype):
notify(u'\r\033[K', end=u'')
notify('\r... loading signature {} (file {} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
notify('\r... loading signature {} ({} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
debug(filename, sig.name())

# block off duplicates.
Expand All @@ -208,7 +214,10 @@ def index(args):
# parse identifier, potentially with splitting
ident = sig.name()
if args.split_identifiers: # hack for NCBI-style names, etc.
ident = ident.split(' ')[0].split('.')[0]
# split on space...
ident = ident.split(' ')[0]
# ...and on period.
ident = ident.split('.')[0]

lineage = assignments.get(ident)

Expand All @@ -221,26 +230,25 @@ def index(args):
# add the signature into the database.
db.insert(sig, ident=ident, lineage=lineage)

# remove from our list of remaining lineages
try:
if lineage:
# remove from our list of remaining ident -> lineage
record_remnants.remove(ident)
except KeyError:
# @CTB
pass

# track ident as used
record_used_idents.add(ident)
# track ident as used
record_used_idents.add(ident)
record_used_lineages.add(lineage)

# track lineage info - either no lineage, or this lineage used.
if lineage is None:
else:
debug('WARNING: no lineage assignment for {}.', ident)
record_no_lineage.add(ident)
else:
record_used_lineages.add(lineage)

# end main add signatures loop

notify(u'\r\033[K', end=u'')
if n_skipped:
notify('... loaded {} signatures; skipped {} because of --require-taxonomy.', total_n, n_skipped)
else:
notify('... loaded {} signatures.', total_n)

# check -- did we find any signatures?
if n == 0:
Expand All @@ -253,6 +261,8 @@ def index(args):
if not db.hashval_to_idx:
error('ERROR: no hash values found - are there any signatures?')
sys.exit(1)
notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
args.ksize, args.scaled)

# summarize:
notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
Expand All @@ -261,6 +271,8 @@ def index(args):

notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
len(record_used_idents), len(set(assignments)))

assert record_used_idents.issubset(set(assignments))
unused_identifiers = set(assignments) - record_used_idents

# now, save!
Expand All @@ -282,7 +294,7 @@ def index(args):
notify('WARNING: no lineage provided for {} signatures.',
len(record_no_lineage))
if record_remnants:
notify('WARNING: no signatures for {} lineage assignments.',
notify('WARNING: no signatures for {} spreadsheet rows.',
len(record_remnants))
if unused_lineages:
notify('WARNING: {} unused lineages.', len(unused_lineages))
Expand Down
32 changes: 25 additions & 7 deletions sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,11 @@ class LCA_Database(Index):
`hashval_to_idx` is a dictionary from individual hash values to sets of
`idx`.
"""
def __init__(self, ksize, scaled):
def __init__(self, ksize, scaled, moltype='DNA'):
self.ksize = int(ksize)
self.scaled = int(scaled)
self.filename = None
self.moltype = moltype

self._next_index = 0
self._next_lid = 0
Expand Down Expand Up @@ -119,6 +120,9 @@ def insert(self, sig, ident=None, lineage=None):
if minhash.ksize != self.ksize:
raise ValueError("cannot insert signature with ksize {} into DB (ksize {})".format(minhash.ksize, self.ksize))

if minhash.moltype != self.moltype:
raise ValueError("cannot insert signature with moltype {} into DB (moltype {})".format(minhash.moltype, self.moltype))

# downsample to specified scaled; this has the side effect of
# making sure they're all at the same scaled value!
minhash = minhash.downsample_scaled(self.scaled)
Expand Down Expand Up @@ -152,6 +156,8 @@ def insert(self, sig, ident=None, lineage=None):
for hashval in minhash.get_mins():
self.hashval_to_idx[hashval].add(idx)

return len(minhash)

def __repr__(self):
return "LCA_Database('{}')".format(self.filename)

Expand All @@ -166,7 +172,7 @@ def select(self, ksize=None, moltype=None):
ok = True
if ksize is not None and self.ksize != ksize:
ok = False
if moltype is not None and moltype != 'DNA':
if moltype is not None and moltype != self.moltype:
ok = False

if ok:
Expand Down Expand Up @@ -204,13 +210,15 @@ def load(cls, db_name):
if db_type != 'sourmash_lca':
raise ValueError("database file '{}' is not an LCA db.".format(db_name))

if version != '2.0' or 'lid_to_lineage' not in load_d:
version = float(version)
if version < 2.0 or 'lid_to_lineage' not in load_d:
raise ValueError("Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one.")

ksize = int(load_d['ksize'])
scaled = int(load_d['scaled'])
moltype = load_d.get('moltype', 'DNA')

db = cls(ksize, scaled)
db = cls(ksize, scaled, moltype)

# convert lineage_dict to proper lineages (tuples of LineagePairs)
lid_to_lineage_2 = load_d['lid_to_lineage']
Expand Down Expand Up @@ -258,11 +266,12 @@ def save(self, db_name):
with xopen(db_name, 'wt') as fp:
# use an OrderedDict to preserve output order
save_d = OrderedDict()
save_d['version'] = '2.0'
save_d['version'] = '2.1'
save_d['type'] = 'sourmash_lca'
save_d['license'] = 'CC0'
save_d['ksize'] = self.ksize
save_d['scaled'] = self.scaled
save_d['moltype'] = self.moltype

# convert lineage internals from tuples to dictionaries
d = OrderedDict()
Expand Down Expand Up @@ -387,8 +396,17 @@ def _signatures(self):
"Create a _signatures member dictionary that contains {idx: sigobj}."
from sourmash import MinHash, SourmashSignature

# CTB: if we wanted to support protein/other minhashes, do it here.
minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled)
is_protein = False
is_hp = False
is_dayhoff = False
if self.moltype == 'protein':
is_protein = True
elif self.moltype == 'hp':
is_hp = True
elif self.moltype == 'dayhoff':
is_dayhoff = True
minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled,
is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff)

debug('creating signatures for LCA DB...')
mhd = defaultdict(minhash.copy_and_clear)
Expand Down
13 changes: 2 additions & 11 deletions sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,7 @@ def describe(args):
for (sig, signature_file) in siglist:
mh = sig.minhash
ksize = mh.ksize
moltype = 'DNA'
if mh.is_protein:
if mh.dayhoff:
moltype = 'dayhoff'
elif mh.hp:
moltype = 'hp'
else:
moltype = 'protein'
moltype = mh.moltype
scaled = mh.scaled
num = mh.num
seed = mh.seed
Expand Down Expand Up @@ -169,9 +162,7 @@ def overlap(args):
md5_2 = sig2.md5sum()

ksize = sig1.minhash.ksize
moltype = 'DNA'
if sig1.minhash.is_protein:
moltype = 'protein'
moltype = sig1.minhash.moltype

num = sig1.minhash.num
size1 = len(sig1.minhash)
Expand Down
22 changes: 13 additions & 9 deletions sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,25 @@ def get_moltype(sig, require=False):


def calculate_moltype(args, default=None):
if args.protein:
if args.dna is True:
error('cannot specify both --dna/--rna and --protein!')
sys.exit(-1)
args.dna = False

moltype = default

n = 0
if args.dna:
moltype = 'DNA'
elif args.dayhoff:
n += 1
if args.dayhoff:
moltype = 'dayhoff'
elif args.hp:
n += 1
if args.hp:
moltype = 'hp'
elif args.protein:
n += 1
if args.protein:
moltype = 'protein'
n += 1

if n > 1:
error("cannot specify more than one of --dna/--rna/--protein/--hp/--dayhoff")
sys.exit(-1)

return moltype

Expand Down
3 changes: 3 additions & 0 deletions tests/test-data/prot/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein --no-dna --outdir protein --protein
sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein --no-dna --outdir hp --hp
sourmash compute -k 57 *_protein.faa.gz --scaled=100 -f --input-is-protein --no-dna --outdir dayhoff --dayhoff
Binary file added tests/test-data/prot/dayhoff.lca.json.gz
Binary file not shown.
Binary file added tests/test-data/prot/dayhoff.sbt.zip
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions tests/test-data/prot/gtdb-subset-lineages.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
accession,gtdb_id,superkingdom,phylum,class,order,family,genus,species
GCA_001593935,d__Archaea,p__Crenarchaeota,c__Bathyarchaeia,o__B26-1,f__B26-1,g__B26-1,s__B26-1 sp001593935
GCA_001593925,d__Archaea,p__Crenarchaeota,c__Bathyarchaeia,o__B26-1,f__B26-1,g__B26-1,s__B26-1 sp001593925
Binary file added tests/test-data/prot/hp.lca.json.gz
Binary file not shown.
Binary file added tests/test-data/prot/hp.sbt.zip
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added tests/test-data/prot/protein.lca.json.gz
Binary file not shown.
Binary file added tests/test-data/prot/protein.sbt.zip
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

File renamed without changes.
2 changes: 1 addition & 1 deletion tests/test_bugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def test_bug_781():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('protein_781.sig')
testdata1 = utils.get_test_data('protein_bug_781.sig')

status, out, err = utils.runscript('sourmash',
['compare',
Expand Down
28 changes: 28 additions & 0 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,6 +786,34 @@ def test_sig_describe_1(c):
for line in expected_output:
assert line.strip() in out


@utils.in_thisdir
def test_sig_describe_protein(c):
# test describe on a singleton protein signature
testdata = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
c.run_sourmash('sig', 'describe', testdata)

assert 'k=57 molecule=protein num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out


@utils.in_thisdir
def test_sig_describe_hp(c):
# test describe on a singleton hp signature
testdata = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
c.run_sourmash('sig', 'describe', testdata)

assert 'k=57 molecule=hp num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out


@utils.in_thisdir
def test_sig_describe_dayhoff(c):
# test describe on a singleton dayhoff signature
testdata = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
c.run_sourmash('sig', 'describe', testdata)

assert 'k=57 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out


@utils.in_tempdir
def test_sig_describe_1_hp(c):
# get basic info on a signature
Expand Down
Loading