Skip to content

Commit

Permalink
v1.0.1 from webserver
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-raden committed Dec 16, 2024
1 parent 0d0872f commit 667c2fc
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 160 deletions.
24 changes: 5 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,41 +65,27 @@ At the end of the page, you find the [example calls](#sample-calls-on-test-data)
### Generate kmer features with `generate_kmer_features.py`

```
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--threads THREADS]
[--batchsize BATCHSIZE] [--report-counts]
usage: generate_kmer_features.py [-h] --kmers KMERS --fasta FASTA [--report-counts]
[--out-csv OUT_CSV] [--minE-subopt MINE_SUBOPT]
[--minE-intarna MINE_INTARNA]
[--feature-context FEATURE_CONTEXT]
Generate kmer-based count features based on sequence plus RNAsubopt and IntaRNA position-
wise energies.. Sample calls: "python generate_kmer_features.py --kmers "AGA,GC,GGG"
--fasta test.fa --out-csv counts.csv""python generate_kmer_features.py --kmers
"AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-
intarna -2"
wise energies.
Sample calls:
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv counts.csv"
"python generate_kmer_features.py --kmers "AGA,GC,GGG" --fasta test.fa --out-csv "stdout" --report-counts --minE-subopt -5 --minE-intarna -2"
optional arguments:
-h, --help show this help message and exit
--kmers KMERS List of kmers as a comma separated string e.g. "AGG,GA,GG"
--fasta FASTA Sequences to extract features from as a FASTA file
--threads THREADS Number of threads used for processing (default: 1) (WARNING:
                       threads > 1 will impair stdout prints)
--batchsize BATCHSIZE
If the number of processed fasta sequences is greater than batch
size batch processing will be applied. This will lower memory
consumption (default: 10000)
--report-counts Whether to report counts as integer, default is binary
nohit(0)-hit(1)
--out-csv OUT_CSV CSV File name to write counts, pass "stdout" for stdout
--minE-subopt MINE_SUBOPT
Minimum free energy of the position on RNAsubopt result
--minE-intarna MINE_INTARNA
Minimum free energy of the position on IntaRNA result
--feature-context FEATURE_CONTEXT
feature groups (contexts) are to be generated by case-insensitive
single letter a - any context (just k-mer occurrence) s -
unpaired in stable intra-molecular structures h - unpaired in
stable inter-molecular homo-duplex RRIs u - unpaired in both in
(s) and (h)
```

### Fit and predict ML model with `fit_predict.py`
Expand Down
19 changes: 11 additions & 8 deletions src/fit_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
X = df_selected_features.values
#print("X", X)
#print("Y", Y)
n_splits = 3
n_splits = 2
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=10, random_state=2222)
#skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=True)
arr_PRFS = list()#numpy.array()
Expand All @@ -109,7 +109,7 @@ def fit_dfs_trains(df_features, df_labels, scikit_model, top_features=None, vali
columns=["Precision","Recall","F1"], index=['metric','std']).round(3)

print("10-rep {}-fold CV:".format(n_splits))
print(df_metrics.to_csv(sep=','))
print(df_metrics)

scikit_model.fit(X, y)
print("Training score: {0:.3f}".format(scikit_model.score(X, y)))
Expand Down Expand Up @@ -167,13 +167,13 @@ def is_valid_file(file_name):
model = pickle.load(inmodel)
else:
if args.model_choice == 'SVM-rbf':
model = svm.SVC(kernel='rbf', gamma='scale', probability=True)
model = svm.SVC(kernel='rbf', gamma='scale', probability=True, class_weight="balanced")
if args.model_choice == 'SVM-linear':
model = svm.SVC(kernel='linear', gamma='scale', probability=True)
model = svm.SVC(kernel='linear', gamma='scale', probability=True, class_weight="balanced")
elif args.model_choice == 'Logistic-liblinear':
model = LogisticRegression(solver='liblinear')
model = LogisticRegression(solver='liblinear', class_weight="balanced")
elif args.model_choice == 'Logistic-lbfgs':
model = LogisticRegression(solver='lbfgs')
model = LogisticRegression(solver='lbfgs', class_weight="balanced")

if args.standardize_scaling:
# build pipe: first standardize by substracting mean and dividing std
Expand All @@ -200,8 +200,11 @@ def is_valid_file(file_name):
# with open(args.out_predict_labels, 'w') as outlabels:
# outlabels.write('predicted_class,prob_{},prob_{}\n'.format(model.classes_[0],model.classes_[1]))
# y_predict.tofile(outlabels,sep="\n")
pd.DataFrame(stacked_arr, index=df_features_predict.index).to_csv(args.out_predict_labels, index=True,
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1])],float_format='%.2F')
df_out_predict = pd.DataFrame(stacked_arr, index=df_features_predict.index)
df_out_predict.sort_values(by=[df_out_predict.columns[1],df_out_predict.columns[2]], ascending=False, inplace=True)
df_out_predict['rank'] = df_out_predict.iloc[:,2].rank(method="first",ascending=False)
df_out_predict.to_csv(args.out_predict_labels, index=True,
header=['predicted_class','prob_{}'.format(model.classes_[0]),'prob_{}'.format(model.classes_[1]),'rank'],float_format='%.3F')

if args.save_model is True:
with open(args.out_model, 'wb') as outf:
Expand Down
154 changes: 21 additions & 133 deletions src/generate_kmer_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,13 @@
import re
import argparse
import os.path
from multiprocessing import Pool
from tempfile import TemporaryDirectory
import pickle
from typing import List
import sys
BINDIR = os.path.dirname(os.path.realpath(__file__))
def find_kmer_hits(sequence, kmer):
    """Return start indices of every occurrence of *kmer* in *sequence*.

    Overlapping occurrences are all reported: the k-mer is wrapped in a
    zero-width lookahead, so each match consumes no characters and the
    scan resumes at the next position.

    Note: *kmer* is interpolated into a regular expression un-escaped, so
    regex metacharacters in it are treated as pattern syntax (fine for
    plain nucleotide strings).
    """
    lookahead = re.compile('(?=' + kmer + ')')
    positions = []
    for match in lookahead.finditer(sequence):
        positions.append(match.start())
    return positions

def call_command (cmd):
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE, stderr=PIPE)
p = subprocess.Popen(cmd,shell=True,stdin=None, stdout=PIPE)
(result, error) = p.communicate()
if error:
raise RuntimeError("Error in calling cmd or perl script\ncmd:{}\nstdout:{}\nstderr:{}".format(cmd, result, error))
Expand Down Expand Up @@ -67,50 +63,6 @@ def is_valid_file(file_name):
return os.path.abspath(file_name)
else:
raise FileNotFoundError(os.path.abspath(file_name))

def multicore_wrapper(seq_record, args):
    """Build one CSV row of k-mer context features for a single sequence.

    Designed as the per-record worker for multiprocessing: it takes one
    Biopython SeqRecord plus the parsed CLI args and returns the CSV row
    string (record id followed by one field per selected k-mer context).

    NOTE(review): relies on the module-level global ``kmers_list`` and the
    helpers ``get_subopt_intarna_strs`` / ``find_hits_all`` defined
    elsewhere in this file.
    """
    out_csv_str = seq_record.id
    # Progress marker on stdout (interleaves unpredictably under >1 process).
    print(seq_record.id)

    # Per-position accessibility strings from RNAsubopt (intra-molecular)
    # and IntaRNA (homo-duplex), thresholded at the given minimum energies.
    seq_subopt, seq_intarna = get_subopt_intarna_strs(str(seq_record.seq),
                                                      minE_subopt=args.minE_subopt,
                                                      minE_intarna=args.minE_intarna)
    for kmer in kmers_list:
        # Hit positions of this k-mer in: raw sequence, subopt-unpaired
        # regions, intarna-unpaired regions, and the intersection of both.
        hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(seq_record.seq),
                                                                 seq_subopt,
                                                                 seq_intarna,
                                                                 kmer)
        cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(
            hsubopt), len(hintarna), len(hsubopt_intarna)
        # Keep only the feature contexts requested via --feature-context
        # (case-insensitive letters: a=any, s=intra, h=dimer, u=both).
        array_features = []
        if "a" in args.feature_context.lower():
            array_features.append(cseq)
        if "s" in args.feature_context.lower():
            array_features.append(csubopt)
        if "h" in args.feature_context.lower():
            array_features.append(cintarna)
        if "u" in args.feature_context.lower():
            array_features.append(csubopt_intarna)

        if args.report_counts is True:
            # Raw integer counts per context.
            out_csv_str += ''.join([',{}'.format(f) for f in array_features])
        else:
            # Binary presence/absence encoding: 0 = no hit, 1 = any hit.
            binary_hits = ['0' if c == 0 else '1' for c in array_features]
            out_csv_str += "," + ','.join(binary_hits)
    return out_csv_str


def write_pickled_output(files: List[str], outfile: str, csv_header: str):
    """Concatenate pickled per-batch row lists into one CSV file.

    Each path in *files* holds a pickled list of CSV row strings. The
    header is written first, then every batch's rows in order, one row
    per line.
    """
    with open(outfile, "w") as sink:
        sink.write(csv_header)
        for batch_path in files:
            with open(batch_path, "rb") as source:
                rows = pickle.load(source)
            sink.write("\n".join(rows) + "\n")
            # Release the batch before loading the next one to cap memory.
            del rows




if __name__ == '__main__':

Expand All @@ -120,99 +72,35 @@ def write_pickled_output(files: List[str], outfile: str, csv_header: str):

parser.add_argument('--kmers', required=True, type=str, help='List of kmers as a comma separated string e.g. \"AGG,GA,GG\"')
parser.add_argument('--fasta', required=True, type=is_valid_file, help='Sequences to extract features from as a FASTA file')
parser.add_argument('--threads', type=int, default=1, help='Number of threads used for processing (default: 1) (WARNING: threads > 1 will impair stdout prints')
parser.add_argument('--batchsize', type=int, default=10000, help='If the number of processed fasta sequences is greater than batch size batch processing will be applied. This will lower memory consumption (default: 10000)')
parser.add_argument('--report-counts', action='store_true', help='Whether to report counts as integer, default is binary nohit(0)-hit(1)'),
parser.add_argument('--out-csv', type=str, default='stdout', help='CSV File name to write counts, pass "stdout" for stdout ')
parser.add_argument('--minE-subopt', default=-3, type=int, help='Minimum free energy of the position on RNAsubopt result')
parser.add_argument('--minE-intarna', default=-3, type=int, help='Minimum free energy of the position on IntaRNA result')
parser.add_argument('--feature-context', default="ashu", type=str, help='feature groups (contexts) are to be generated by case-insensitive single letter'
+'\n\ta - any context (just k-mer occurrence)'
+'\n\ts - unpaired in stable intra-molecular structures'
+'\n\th - unpaired in stable inter-molecular homo-duplex RRIs'
+'\n\tu - unpaired in both in (s) and (h)')


args = parser.parse_args()
print(args)
out_csv_str = "id"
kmers_list = args.kmers.split(',')
for kmer in kmers_list:
if "a" in args.feature_context.lower():
out_csv_str += ",{}_any".format(kmer)
if "s" in args.feature_context.lower():
out_csv_str += ",{}_intra".format(kmer)
if "h" in args.feature_context.lower():
out_csv_str += ",{}_dimer".format(kmer)
if "u" in args.feature_context.lower():
out_csv_str += ",{}_free".format(kmer)

out_csv_str += ",{}_any,{}_intra,{}_dimer,{}_free".format(kmer,kmer,kmer,kmer)
out_csv_str += '\n'
if args.threads == 1:
for r in SeqIO.parse(args.fasta, format='fasta'):
print(r.id)
out_csv_str += r.id
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
for kmer in kmers_list:
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
array_features = []
if "a" in args.feature_context.lower():
array_features.append(cseq)
if "s" in args.feature_context.lower():
array_features.append(csubopt)
if "h" in args.feature_context.lower():
array_features.append(cintarna)
if "u" in args.feature_context.lower():
array_features.append(csubopt_intarna)

if args.report_counts is True:
out_csv_str += ''.join([',{}'.format(f) for f in array_features])
else:
binary_hits = ['0' if c==0 else '1' for c in array_features]
out_csv_str += ","+','.join(binary_hits)
out_csv_str += '\n'

if args.out_csv == "stdout":
print(out_csv_str)
else:
with open(args.out_csv, 'w') as outfile:
outfile.write(out_csv_str)
else:

calls = []
for seq_record in SeqIO.parse(args.fasta, format='fasta'):
calls.append((seq_record, args))

if args.batchsize < len(calls):
tmp_dir = TemporaryDirectory(prefix="BrainDead")
files = []
batch_calls = [calls[x:x+args.batchsize] for x in range(0, len(calls), args.batchsize)]
for x, batch in enumerate(batch_calls):
with Pool(processes=args.threads) as pool:
outstrings = pool.starmap(multicore_wrapper, batch)
file = os.path.join(tmp_dir.name, f"batch_{x}.pckl")
files.append(file)
with open(file, "wb") as handle:
pickle.dump(outstrings, handle)
write_pickled_output(files=files,
outfile=args.out_csv,
csv_header=out_csv_str)


else:
with Pool(processes=args.threads) as pool:
outstrings = pool.starmap(multicore_wrapper, calls)

out_csv_str += "\n".join(outstrings) + "\n"

if args.out_csv == "stdout":
print(out_csv_str)
for r in SeqIO.parse(args.fasta, format='fasta'):
print(r.id)
out_csv_str += r.id
seq_subopt, seq_intarna = get_subopt_intarna_strs(str(r.seq), minE_subopt=args.minE_subopt, minE_intarna=args.minE_intarna)
for kmer in kmers_list:
hseq, hsubopt, hintarna, hsubopt_intarna = find_hits_all(str(r.seq),seq_subopt,seq_intarna, kmer)
print(kmer, hseq, hsubopt, hintarna, hsubopt_intarna)
cseq, csubopt, cintarna, csubopt_intarna = len(hseq), len(hsubopt), len(hintarna), len(hsubopt_intarna)
if args.report_counts is True:
out_csv_str += ',{},{},{},{}'.format(cseq, csubopt, cintarna, csubopt_intarna)
else:
with open(args.out_csv, 'w') as outfile:
outfile.write(out_csv_str)




binary_hits = ['0' if c==0 else '1' for c in [cseq, csubopt, cintarna, csubopt_intarna]]
out_csv_str += ","+','.join(binary_hits)
out_csv_str += '\n'

if args.out_csv == "stdout":
print(out_csv_str)
else:
with open(args.out_csv, 'w') as outfile:
outfile.write(out_csv_str)

0 comments on commit 667c2fc

Please sign in to comment.