Skip to content

Commit

Permalink
[WIP] make sure query and database files exist (#440)
Browse files Browse the repository at this point in the history
* make sure query and database files exist

The databases in the lca commands are loaded first, which can
take a bit of time (~20s) and means that there is a delay in
finding out whether one of the query file names has been mistyped.
This small function checks that all of the database and query files
exist before trying to load anything.

* fix merge problems; switch to using 'exists'

* make two calls to check_files_exist

Python 2.7 can only expand one list per function call. For compatibility,
split the calls to check_files_exist for both the db and query. When
Python 2.7 compatibility is dropped, these duplicated calls can be
merged into one.

* remove some unnecessary spaces
  • Loading branch information
ctSkennerton authored and ctb committed Mar 11, 2018
1 parent 9041723 commit 41ad6b9
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 11 deletions.
10 changes: 8 additions & 2 deletions sourmash/lca/command_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from .. import sourmash_args, load_signatures
from ..logging import notify, error
from . import lca_utils
from .lca_utils import debug, set_debug

from .lca_utils import debug, set_debug, check_files_exist

DEFAULT_THRESHOLD=5 # how many counts of a taxid at min

Expand Down Expand Up @@ -107,6 +106,13 @@ def classify(args):
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]

# have to have two calls as python < 3.5 can only have one expanded list
if not check_files_exist(*args.query):
sys.exit(-1)

if not check_files_exist(*args.db):
sys.exit(-1)

# load all the databases
dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)

Expand Down
10 changes: 5 additions & 5 deletions sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@
from .. import sourmash_args, save_signatures, SourmashSignature
from ..logging import notify, error, print_results
from . import lca_utils
from .lca_utils import debug, set_debug
from .lca_utils import debug, set_debug, check_files_exist
from ..search import format_bp


LCAGatherResult = namedtuple('LCAGatherResult',
'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage, f_match, name, n_equal_matches')

Expand Down Expand Up @@ -77,7 +76,7 @@ def gather_signature(query_sig, dblist, ignore_abundance):
orig_abunds = { k: 1 for k in query_mins }
sum_abunds = sum(orig_abunds.values())


# collect all mentioned lineage_ids -> md5s, from across the databases
md5_to_lineage = {}
md5_to_name = {}
Expand Down Expand Up @@ -108,7 +107,6 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5 = lca_db.lineage_id_to_signature[lid]
signature_size = lca_db.lineage_id_counts[lid]
assignments[hashval].add((md5, signature_size))

# none? quit.
if not assignments:
break
Expand Down Expand Up @@ -172,7 +170,6 @@ def gather_signature(query_sig, dblist, ignore_abundance):
yield result, f_unassigned, est_bp, query_mins

## done.



def gather_main(args):
Expand Down Expand Up @@ -203,6 +200,9 @@ def gather_main(args):
if args.debug:
set_debug(args.debug)

if not check_files_exist(args.query, *args.db):
sys.exit(-1)

# load all the databases
dblist, ksize, scaled = lca_utils.load_databases(args.db, None)

Expand Down
10 changes: 8 additions & 2 deletions sourmash/lca/command_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from .. import sourmash_args, load_signatures
from ..logging import notify, error, print_results
from . import lca_utils
from .lca_utils import debug, set_debug

from .lca_utils import debug, set_debug, check_files_exist

DEFAULT_THRESHOLD=5

Expand Down Expand Up @@ -89,6 +88,13 @@ def summarize_main(args):
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]

# have to have two calls as python < 3.5 can only have one expanded list
if not check_files_exist(*args.query):
sys.exit(-1)

if not check_files_exist(*args.db):
sys.exit(-1)

# load all the databases
dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)

Expand Down
19 changes: 17 additions & 2 deletions sourmash/lca/lca_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
import json
import gzip
from os.path import exists
from collections import OrderedDict, namedtuple, defaultdict, Counter

try: # py2/py3 compat
Expand All @@ -14,13 +15,27 @@
import pprint

from .._minhash import get_max_hash_for_scaled
from ..logging import notify

from ..logging import notify, error

# type to store an element in a taxonomic lineage
LineagePair = namedtuple('LineagePair', ['rank', 'name'])


def check_files_exist(*files):
    """Check that every path given in *files* exists.

    Every path is tested before anything is reported, so a single call
    lists all of the missing paths at once (through ``error``) instead of
    stopping at the first one.

    Returns True when all paths exist (also when no paths are given),
    False when at least one is missing.
    """
    # Collect every missing path so the message can name all of them.
    not_found = [f for f in files if not exists(f)]

    if not_found:
        error('Error! Could not find the following files.'
              ' Make sure the file paths are specified correctly.\n{}'.format('\n'.join(not_found)))

    return not not_found


# ordered list of taxonomic ranks
def taxlist(include_strain=True):
for k in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
Expand Down

0 comments on commit 41ad6b9

Please sign in to comment.