Skip to content

Commit

Permalink
Add thin (incomplete) LastAlign BioContainer API
Browse files Browse the repository at this point in the history
  • Loading branch information
kMutagene committed Sep 16, 2019
1 parent e679406 commit 3304e61
Show file tree
Hide file tree
Showing 3 changed files with 339 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/BioFSharp.BioTools/BioFSharp.BioTools.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
<Compile Include="Hera.fs" />
<Compile Include="HMMER.fs" />
<Compile Include="IntaRNA.fs" />
<Compile Include="LastAlign.fs" />
<Compile Include="MoFF.fs" />
<Compile Include="TargetP.fs" />
<Compile Include="Tmhmm.fs" />
<Compile Include="MoFF.fs" />
<None Include="BioFSharp.BioTools.fsx" />
<None Include="paket.references" />
<None Include="paket.template" />
Expand Down
45 changes: 43 additions & 2 deletions src/BioFSharp.BioTools/BioFSharp.BioTools.fsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#r "netstandard"
#r "../../packages/Newtonsoft.Json.10.0.3/lib/netstandard1.3/Newtonsoft.Json.dll"
#r "../../packages/Newtonsoft.Json.11.0.2/lib/netstandard2.0/Newtonsoft.Json.dll"
#r "../../packages/System.Buffers/lib/netstandard2.0/System.Buffers.dll"
#r "../../packages/Docker.DotNet/lib/netstandard2.0/Docker.DotNet.dll"

Expand All @@ -16,6 +16,7 @@
#load "Blast.fs"
#load "ClustalO.fs"
#load "HMMER.fs"
#load "LastAlign.fs"

open System.Threading
open System.Threading
Expand Down Expand Up @@ -492,4 +493,44 @@ let hmmbuildParamz =
OutputHMMFile @"C:\Users\Kevin\source\repos\CsbScaffold\Docker\data\hmmer_testfiles\testOutput.hmm"
]

runHMMbuild hmmerContext hmmbuildParamz
runHMMbuild hmmerContext hmmbuildParamz



// ---- LastAlign BioContainer scratch section ----
// Creates a container context for the "last-align" image, builds a LAST database with lastdb
// and then runs lastal against a database. All paths are machine-specific absolute paths.
let imageLastAlign = Docker.ImageName "last-align"

// Context is created with the genome directory passed as the mount argument;
// the async initialisation is run synchronously here.
let lastAlignContext =
BioContainer.initBcContextWithMountAsync client imageLastAlign @"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes"
|> Async.RunSynchronously


open LastAlign

// lastdb parameters: one genomic fasta file as input, "CyanidioschyzonDB" as output name.
let lastDBParameters =
[
LastDBParams.Input
(InputOptions.Single @"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\GCF_000091205.1_ASM9120v1_genomic.fna")
LastDBParams.OutputName
@"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\CyanidioschyzonDB"

]

runLastDBAsync lastAlignContext lastDBParameters
|> Async.RunSynchronously

// NOTE(review): the context is disposed here but is passed to runLastAlignAsync further down.
// Presumably this script is meant to be evaluated selectively in F# interactive - confirm,
// or move the dispose call below the alignment when running top to bottom.
BioContainer.disposeAsync lastAlignContext |> Async.RunSynchronously

// lastal parameters.
// NOTE(review): DBName points at "GaldieriaDB", which is not the database created above
// ("CyanidioschyzonDB") - presumably it was built in an earlier session; verify it exists.
let alignParams =
[
LastAlignParameters.DBName
@"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\GaldieriaDB"
LastAlignParameters.Query
(QueryOptions.SingleFastaFile @"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\GCF_000091205.1_ASM9120v1_genomic.fna")
LastAlignParameters.OutputFileName
@"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\GenomeAlignment.maf"
LastAlignParameters.Verbose
]

// Runs lastal, then splits the returned text on any line ending and writes the lines
// to GenomeAlignment.maf on the host.
runLastAlignAsync lastAlignContext alignParams
|> Async.RunSynchronously
|> fun x -> File.WriteAllLines(@"C:\Users\kevin\Desktop\Microbiology_CrossGenomics\Data\Genomes\GenomeAlignment.maf",x.Split([|"\r\n";"\r";"\n"|],StringSplitOptions.None))
294 changes: 294 additions & 0 deletions src/BioFSharp.BioTools/LastAlign.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
namespace BioFSharp.BioTools

///!! Currently many parameters are not implemented!
///
///From the Last docs at http://last.cbrc.jp:
///
///LAST finds similar regions between sequences.
///
///LAST can:
///
///Handle big sequence data, e.g:
///
/// - Compare two vertebrate genomes
///
/// - Align billions of DNA reads to a genome
///
///Indicate the reliability of each aligned column.
///
///Use sequence quality data properly.
///
///Compare DNA to proteins, with frameshifts.
///
///Compare PSSMs to sequences
///
///Calculate the likelihood of chance similarities between random sequences.
///
///Do split and spliced alignment.
///
///Train alignment parameters for unusual kinds of sequence (e.g. nanopore).
//TO-DO: Full Parameter support

module LastAlign =

open BioContainer

//Usage: lastdb [options] output-name fasta-sequence-file(s)
//Prepare sequences for subsequent alignment with lastal.

//Main Options:
//-h, --help: show all options and their default settings, and exit
//-p: interpret the sequences as proteins
//-R: repeat-marking options (default=10)
//-c: soft-mask lowercase letters (in reference *and* query sequences)
//-u: seeding scheme (default: YASS for DNA, else exact-match seeds)

//Advanced Options (default settings):
//-w: use initial matches starting at every w-th position in each sequence (1)
//-W: use "minimum" positions in sliding windows of W consecutive positions (1)
//-S: strand: 0=reverse, 1=forward, 2=both (1)
//-s: volume size (unlimited)
//-Q: input format: 0=fasta or fastq-ignore,
// 1=fastq-sanger, 2=fastq-solexa, 3=fastq-illumina (fasta)
//-P: number of parallel threads (1)
//-m: seed pattern
//-a: user-defined alphabet
//-i: minimum limit on initial matches per query position (0)
//-b: bucket depth
//-C: child table type: 0=none, 1=byte-size, 2=short-size, 3=full (0)
//-x: just count sequences and letters
//-v: be verbose: write messages about what lastdb is doing
//-V, --version: show version information, and exit

/// Seeding schemes that can be handed to lastdb through its -u option.
type SeedingSchemeOptions =
    | YASS
    | ExactMatch

    /// Turns a seeding scheme into command line tokens: "-u" followed by the scheme name.
    // NOTE(review): "exact-match" is passed verbatim as the -u value; confirm lastdb accepts it.
    static member make =
        fun (scheme: SeedingSchemeOptions) ->
            let schemeName =
                match scheme with
                | YASS -> "YASS"
                | ExactMatch -> "exact-match"
            ["-u"; schemeName]

/// The fasta sequence file(s) given to lastdb: either one path or a list of paths.
type InputOptions =
    | Single of string
    | Multiple of string list

    /// Returns the input paths unchanged, as a list of command line tokens.
    static member make =
        fun (input: InputOptions) ->
            match input with
            | Single file -> List.singleton file
            | Multiple files -> files

    /// Returns the input paths after passing each one through MountInfo.containerPathOf for the mount m.
    static member makeWith (m:MountInfo) =
        fun (input: InputOptions) ->
            input
            |> InputOptions.make
            |> List.map (fun path -> MountInfo.containerPathOf m path)

/// Parameters of the lastdb wrapper (usage: lastdb [options] output-name fasta-sequence-file(s)).
type LastDBParams =
    /// fasta-sequence-file(s) argument of lastdb
    | Input of InputOptions
    /// output-name argument of lastdb
    | OutputName of string
    ///-p: interpret the sequences as proteins
    | InputAsProteins
    ///-R: repeat-marking options (default=10)
    | RepeatMarking of int
    ///-c: soft-mask lowercase letters (in reference *and* query sequences)
    | LowercaseSoftMasking
    ///-u: seeding scheme (default: YASS for DNA, else exact-match seeds)
    | SeedingScheme of SeedingSchemeOptions

    /// Converts a single parameter into its command line tokens, leaving paths as given.
    static member make =
        fun (p: LastDBParams) ->
            match p with
            | Input files -> InputOptions.make files
            | OutputName name -> [name]
            | InputAsProteins -> ["-p"]
            | RepeatMarking value -> ["-R"; string value]
            | LowercaseSoftMasking -> ["-c"]
            | SeedingScheme scheme -> SeedingSchemeOptions.make scheme

    /// Converts a single parameter into its command line tokens; Input and OutputName paths
    /// are passed through MountInfo.containerPathOf for the mount m, all other cases are
    /// rendered exactly as by make.
    static member makeCmdWith (m:MountInfo) =
        fun (p: LastDBParams) ->
            match p with
            | Input files -> InputOptions.makeWith m files
            | OutputName name -> [MountInfo.containerPathOf m name]
            | InputAsProteins
            | RepeatMarking _
            | LowercaseSoftMasking
            | SeedingScheme _ -> LastDBParams.make p

/// Assembles a lastdb command line from opt and returns the async computation that executes it
/// through BioContainer.execAsync on bcContext.
///
/// The command is built as: lastdb [options] output-name fasta-sequence-file(s).
/// Only the first Input and the first OutputName in opt are used; every other parameter is
/// emitted as an option in its original order. Paths are converted with bcContext.Mount.
///
/// Fails immediately (failwith, before the async is created) when opt contains no Input
/// or no OutputName.
let runLastDBAsync (bcContext:BioContainer.BcContext) (opt:LastDBParams list) =
    //Usage: lastdb [options] output-name fasta-sequence-file(s)
    let toArgs = LastDBParams.makeCmdWith bcContext.Mount

    // Renders the first parameter matching isWanted, or fails with missingMsg if there is none.
    let required (missingMsg: string) (isWanted: LastDBParams -> bool) =
        match List.tryFind isWanted opt with
        | Some p -> toArgs p
        | None -> failwith missingMsg

    // Input is checked before OutputName so that a list missing both reports the input first.
    let input = required "no input sequence given" (function Input _ -> true | _ -> false)
    let output = required "no output name given" (function OutputName _ -> true | _ -> false)

    // Everything that is neither of the two positional arguments is an option flag.
    let cmds =
        opt
        |> List.filter (function Input _ | OutputName _ -> false | _ -> true)
        |> List.map toArgs

    let tp = "lastdb" :: (List.concat cmds @ output @ input)

    printfn "Starting process lastdb\r\nparameters:"
    for args in cmds do
        printfn "\t%s" (String.concat " " args)

    async {
        return! BioContainer.execAsync bcContext tp
    }


//Usage: lastal [options] lastdb-name fasta-sequence-file(s)

//Find and align similar sequences.
//Cosmetic options:
//-h, --help: show all options and their default settings, and exit
//-V, --version: show version information, and exit
//-v: be verbose: write messages about what lastal is doing
//-f: output format: TAB, MAF, BlastTab, BlastTab+ (default=MAF)

//E-value options (default settings):
//-D: query letters per random alignment (1e+06)
//-E: maximum expected alignments per square giga (1e+18/D/refSize/numOfStrands)
//-r: match score (2 if -M, else 6 if 0<Q<5, else 1 if DNA)
//-q: mismatch cost (3 if -M, else 18 if 0<Q<5, else 1 if DNA)
//-p: match/mismatch score matrix (protein-protein: BL62, DNA-protein: BL80)
//-a: gap existence cost (DNA: 7, protein: 11, 0<Q<5: 21)
//-b: gap extension cost (DNA: 1, protein: 2, 0<Q<5: 9)
//-A: insertion existence cost (a)
//-B: insertion extension cost (b)
//-c: unaligned residue pair cost (off)
//-F: frameshift cost (off)
//-x: maximum score drop for preliminary gapped alignments (z)
//-y: maximum score drop for gapless alignments (min[t*10, x])
//-z: maximum score drop for final gapped alignments (e-1)
//-d: minimum score for gapless alignments (min[e, t*ln(1000*refSize/n)])
//-e: minimum score for gapped alignments

//Initial-match options (default settings):
//-m: maximum initial matches per query position (10)
//-l: minimum length for initial matches (1)
//-L: maximum length for initial matches (infinity)
//-k: use initial matches starting at every k-th position in each query (1)
//-W: use "minimum" positions in sliding windows of W consecutive positions

//Miscellaneous options (default settings):
//-s: strand: 0=reverse, 1=forward, 2=both (2 for DNA, 1 for protein)
//-S: score matrix applies to forward strand of: 0=reference, 1=query (0)
//-K: omit alignments whose query range lies in >= K others with > score (off)
//-C: omit gapless alignments in >= C others with > score-per-length (off)
//-P: number of parallel threads (1)
//-i: query batch size (8 KiB, unless there is > 1 thread or lastdb volume)
//-M: find minimum-difference alignments (faster but cruder)
//-T: type of alignment: 0=local, 1=overlap (0)
//-n: maximum gapless alignments per query position (infinity if m=0, else m)
//-N: stop after the first N alignments per query strand
//-R: repeat-marking options (the same as was used for lastdb)
//-u: mask lowercase during extensions: 0=never, 1=gapless,2=gapless+postmask, 3=always (2 if lastdb -c and Q<5, else 0)
//-w: suppress repeats inside exact matches, offset by <= this distance (1000)
//-G: genetic code file
//-t: 'temperature' for calculating probabilities (1/lambda)
//-g: 'gamma' parameter for gamma-centroid and LAMA (1)
//-j: output type: 0=match counts, 1=gapless, 2=redundant gapped, 3=gapped,4=column ambiguity estimates, 5=gamma-centroid, 6=LAMA, 7=expected counts (3)
//-Q: input format: 0=fasta or fastq-ignore, 1=fastq-sanger, 2=fastq-solexa,3=fastq-illumina, 4=prb, 5=PSSM (fasta)

/// Output formats selectable with the -f option of lastal.
type OutputFormatOptions =
    | TAB
    | MAF
    | BlastTab
    | BlastTabPlus

    /// Turns an output format into command line tokens: "-f" followed by the format name.
    static member make =
        fun (format: OutputFormatOptions) ->
            let formatName =
                match format with
                | TAB -> "TAB"
                | MAF -> "MAF"
                | BlastTab -> "BlastTab"
                | BlastTabPlus -> "BlastTab+"
            ["-f"; formatName]

/// The query given to lastal: a single fasta file path or a file pattern.
type QueryOptions =
    | SingleFastaFile of string
    | MultipleFilePattern of string

    /// Returns the query string unchanged as a single command line token.
    static member make =
        fun (query: QueryOptions) ->
            match query with
            | SingleFastaFile path
            | MultipleFilePattern path -> [path]

    /// Returns the query string after passing it through MountInfo.containerPathOf for the mount m.
    // NOTE(review): a MultipleFilePattern is converted like a plain path - confirm that
    // containerPathOf leaves wildcard patterns intact.
    static member makeWith (m:MountInfo) =
        fun (query: QueryOptions) ->
            query
            |> QueryOptions.make
            |> List.map (fun path -> MountInfo.containerPathOf m path)

/// Parameters of the lastal wrapper (usage: lastal [options] lastdb-name fasta-sequence-file(s)).
type LastAlignParameters =
    /// lastdb-name argument of lastal
    | DBName of string
    /// fasta-sequence-file(s) argument of lastal
    | Query of QueryOptions
    /// file name emitted after a ">" token at the end of the command
    | OutputFileName of string
    ///-f: output format: TAB, MAF, BlastTab, BlastTab+ (default=MAF)
    | OutputFormat of OutputFormatOptions
    ///-v: be verbose: write messages about what lastal is doing
    | Verbose

    /// Converts a single parameter into its command line tokens, leaving paths as given.
    // NOTE(review): OutputFileName yields a literal ">" token; this only redirects output if
    // the command is interpreted by a shell - confirm how the container executes the tokens.
    static member makeCmd =
        fun (p: LastAlignParameters) ->
            match p with
            | DBName db -> [db]
            | Query q -> QueryOptions.make q
            | OutputFileName file -> [">"; file]
            | OutputFormat fmt -> OutputFormatOptions.make fmt
            | Verbose -> ["-v"]

    /// Converts a single parameter into its command line tokens; DBName, Query and
    /// OutputFileName paths are passed through MountInfo.containerPathOf for the mount m,
    /// the remaining cases are rendered exactly as by makeCmd.
    static member makeCmdWith (m:MountInfo) =
        fun (p: LastAlignParameters) ->
            let toContainerPath path = MountInfo.containerPathOf m path
            match p with
            | DBName db -> [toContainerPath db]
            | Query q -> QueryOptions.makeWith m q
            | OutputFileName file -> [">"; toContainerPath file]
            | OutputFormat _
            | Verbose -> LastAlignParameters.makeCmd p

/// Assembles a lastal command line from opt and returns the async computation that executes it
/// through BioContainer.execReturnAsync on bcContext.
///
/// The command is built as: lastal [options] lastdb-name query output-tokens.
/// Only the first DBName, Query and OutputFileName in opt are used; every other parameter is
/// emitted as an option in its original order. Paths are converted with bcContext.Mount.
///
/// Fails immediately (failwith, before the async is created) when opt contains no DBName,
/// no Query or no OutputFileName.
let runLastAlignAsync (bcContext:BioContainer.BcContext) (opt:LastAlignParameters list) =
    //Usage: lastal [options] lastdb-name fasta-sequence-file(s)
    let toArgs = LastAlignParameters.makeCmdWith bcContext.Mount

    // Renders the first parameter matching isWanted, or fails with missingMsg if there is none.
    let required (missingMsg: string) (isWanted: LastAlignParameters -> bool) =
        match List.tryFind isWanted opt with
        | Some p -> toArgs p
        | None -> failwith missingMsg

    // Checked in this order so the first missing piece determines the error raised.
    let db = required "no search DB given" (function DBName _ -> true | _ -> false)
    let input = required "no query given" (function Query _ -> true | _ -> false)
    let output = required "no output name given" (function OutputFileName _ -> true | _ -> false)

    // Everything that is not one of the three positional pieces is an option flag.
    let cmds =
        opt
        |> List.filter (function DBName _ | Query _ | OutputFileName _ -> false | _ -> true)
        |> List.map toArgs

    let tp = "lastal" :: (List.concat cmds @ db @ input @ output)

    printfn "Starting process lastal\r\nparameters:"
    for args in cmds do
        printfn "\t%s" (String.concat " " args)

    async {
        return! BioContainer.execReturnAsync bcContext tp
    }

0 comments on commit 3304e61

Please sign in to comment.