Skip to content

Commit

Permalink
add UniProt style FastA Header parser
Browse files Browse the repository at this point in the history
  • Loading branch information
HLWeil committed Apr 29, 2019
1 parent d0f059a commit f2a16aa
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 82 deletions.
29 changes: 15 additions & 14 deletions .paket/Paket.Restore.targets
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@

<!-- Disable Paket restore under NCrunch build -->
<PaketRestoreDisabled Condition="'$(NCrunch)' == '1'">True</PaketRestoreDisabled>

<PaketIntermediateOutputPath Condition=" '$(PaketIntermediateOutputPath)' == '' ">$(BaseIntermediateOutputPath.TrimEnd('\').TrimEnd('\/'))</PaketIntermediateOutputPath>
</PropertyGroup>

<Target Name="PaketBootstrapping" Condition="Exists('$(PaketToolsPath)paket.bootstrapper.proj')">
Expand Down Expand Up @@ -115,18 +117,18 @@

<!-- Do a global restore if required -->
<Exec Command='$(PaketBootStrapperCommand)' Condition=" '$(PaketBootstrapperStyle)' == 'classic' AND Exists('$(PaketBootStrapperExePath)') AND !(Exists('$(PaketExePath)'))" ContinueOnError="false" />
<Exec Command='$(PaketCommand) restore' Condition=" '$(PaketRestoreRequired)' == 'true' " ContinueOnError="false" />

<Exec Command='$(PaketCommand) restore' Condition=" '$(PaketRestoreRequired)' == 'true' AND '$(PaketDisableGlobalRestore)' != 'true' " ContinueOnError="false" />
<!-- Step 2 Detect project specific changes -->
<ItemGroup>
<MyTargetFrameworks Condition="'$(TargetFramework)' != '' " Include="$(TargetFramework)"></MyTargetFrameworks>
<!-- Don't include all frameworks when msbuild explicitly asks for a single one -->
<MyTargetFrameworks Condition="'$(TargetFrameworks)' != '' AND '$(TargetFramework)' == '' " Include="$(TargetFrameworks)"></MyTargetFrameworks>
<PaketResolvedFilePaths Include="@(MyTargetFrameworks -> '$(MSBuildProjectDirectory)\obj\$(MSBuildProjectFile).%(Identity).paket.resolved')"></PaketResolvedFilePaths>
<PaketResolvedFilePaths Include="@(MyTargetFrameworks -> '$(PaketIntermediateOutputPath)\$(MSBuildProjectFile).%(Identity).paket.resolved')"></PaketResolvedFilePaths>
</ItemGroup>
<Message Importance="low" Text="MyTargetFrameworks=@(MyTargetFrameworks) PaketResolvedFilePaths=@(PaketResolvedFilePaths)" />
<PropertyGroup>
<PaketReferencesCachedFilePath>$(MSBuildProjectDirectory)\obj\$(MSBuildProjectFile).paket.references.cached</PaketReferencesCachedFilePath>
<PaketReferencesCachedFilePath>$(PaketIntermediateOutputPath)\$(MSBuildProjectFile).paket.references.cached</PaketReferencesCachedFilePath>
<!-- MyProject.fsproj.paket.references has the highest precedence -->
<PaketOriginalReferencesFilePath>$(MSBuildProjectFullPath).paket.references</PaketOriginalReferencesFilePath>
<!-- MyProject.paket.references -->
Expand Down Expand Up @@ -161,8 +163,8 @@

<!-- Step 3 Restore project specific stuff if required -->
<Message Condition=" '$(PaketRestoreRequired)' == 'true' " Importance="low" Text="Detected a change ('$(PaketRestoreRequiredReason)') in the project file '$(MSBuildProjectFullPath)', calling paket restore" />
<Exec Command='$(PaketCommand) restore --project "$(MSBuildProjectFullPath)" --target-framework "$(TargetFrameworks)"' Condition=" '$(PaketRestoreRequired)' == 'true' AND '$(TargetFramework)' == '' " ContinueOnError="false" />
<Exec Command='$(PaketCommand) restore --project "$(MSBuildProjectFullPath)" --target-framework "$(TargetFramework)"' Condition=" '$(PaketRestoreRequired)' == 'true' AND '$(TargetFramework)' != '' " ContinueOnError="false" />
<Exec Command='$(PaketCommand) restore --project "$(MSBuildProjectFullPath)" --output-path "$(PaketIntermediateOutputPath)" --target-framework "$(TargetFrameworks)"' Condition=" '$(PaketRestoreRequired)' == 'true' AND '$(TargetFramework)' == '' " ContinueOnError="false" />
<Exec Command='$(PaketCommand) restore --project "$(MSBuildProjectFullPath)" --output-path "$(PaketIntermediateOutputPath)" --target-framework "$(TargetFramework)"' Condition=" '$(PaketRestoreRequired)' == 'true' AND '$(TargetFramework)' != '' " ContinueOnError="false" />

<!-- This shouldn't actually happen, but just to be sure. -->
<PropertyGroup>
Expand Down Expand Up @@ -195,7 +197,7 @@
</ItemGroup>

<PropertyGroup>
<PaketCliToolFilePath>$(MSBuildProjectDirectory)/obj/$(MSBuildProjectFile).paket.clitools</PaketCliToolFilePath>
<PaketCliToolFilePath>$(PaketIntermediateOutputPath)/$(MSBuildProjectFile).paket.clitools</PaketCliToolFilePath>
</PropertyGroup>

<ReadLinesFromFile File="$(PaketCliToolFilePath)" >
Expand All @@ -214,23 +216,22 @@

<!-- Disabled for now until we know what to do with runtime deps - https://github.com/fsprojects/Paket/issues/2964
<PropertyGroup>
<RestoreConfigFile>$(MSBuildProjectDirectory)/obj/$(MSBuildProjectFile).NuGet.Config</RestoreConfigFile>
<RestoreConfigFile>$(PaketIntermediateOutputPath)/$(MSBuildProjectFile).NuGet.Config</RestoreConfigFile>
</PropertyGroup> -->

</Target>

<Target Name="PaketDisableDirectPack" AfterTargets="_IntermediatePack" BeforeTargets="GenerateNuspec" Condition="('$(IsPackable)' == '' Or '$(IsPackable)' == 'true') And Exists('$(MSBuildProjectDirectory)/obj/$(MSBuildProjectFile).references')" >
<Target Name="PaketDisableDirectPack" AfterTargets="_IntermediatePack" BeforeTargets="GenerateNuspec" Condition="('$(IsPackable)' == '' Or '$(IsPackable)' == 'true') And Exists('$(PaketIntermediateOutputPath)/$(MSBuildProjectFile).references')" >
<PropertyGroup>
<ContinuePackingAfterGeneratingNuspec>false</ContinuePackingAfterGeneratingNuspec>
<DetectedMSBuildVersion>$(MSBuildVersion)</DetectedMSBuildVersion>
<DetectedMSBuildVersion Condition="$(MSBuildVersion) == ''">15.8.0</DetectedMSBuildVersion>
</PropertyGroup>
</Target>

<Target Name="PaketOverrideNuspec" AfterTargets="GenerateNuspec" Condition="('$(IsPackable)' == '' Or '$(IsPackable)' == 'true') And Exists('$(MSBuildProjectDirectory)/obj/$(MSBuildProjectFile).references')" >

<Target Name="PaketOverrideNuspec" AfterTargets="GenerateNuspec" Condition="('$(IsPackable)' == '' Or '$(IsPackable)' == 'true') And Exists('$(PaketIntermediateOutputPath)/$(MSBuildProjectFile).references')" >
<ItemGroup>
<_NuspecFilesNewLocation Include="$(BaseIntermediateOutputPath)$(Configuration)\*.nuspec"/>
<_NuspecFilesNewLocation Include="$(PaketIntermediateOutputPath)\$(Configuration)\*.nuspec"/>
<MSBuildMajorVersion Include="$(DetectedMSBuildVersion.Replace(`-`, `.`).Split(`.`)[0])" />
<MSBuildMinorVersion Include="$(DetectedMSBuildVersion.Replace(`-`, `.`).Split(`.`)[1])" />
</ItemGroup>
Expand All @@ -246,8 +247,8 @@
<UseMSBuild15_8_Pack Condition=" '$(NuGetToolVersion)' != '4.0.0' AND (! $(UseMSBuild15_9_Pack)) AND (! $(UseMSBuild16_0_Pack)) ">true</UseMSBuild15_8_Pack>
<UseNuGet4_Pack>false</UseNuGet4_Pack>
<UseNuGet4_Pack Condition=" (! $(UseMSBuild15_8_Pack)) AND (! $(UseMSBuild15_9_Pack)) AND (! $(UseMSBuild16_0_Pack)) ">true</UseNuGet4_Pack>
<AdjustedNuspecOutputPath>$(BaseIntermediateOutputPath)$(Configuration)</AdjustedNuspecOutputPath>
<AdjustedNuspecOutputPath Condition="@(_NuspecFilesNewLocation) == ''">$(BaseIntermediateOutputPath)</AdjustedNuspecOutputPath>
<AdjustedNuspecOutputPath>$(PaketIntermediateOutputPath)\$(Configuration)</AdjustedNuspecOutputPath>
<AdjustedNuspecOutputPath Condition="@(_NuspecFilesNewLocation) == ''">$(PaketIntermediateOutputPath)</AdjustedNuspecOutputPath>
</PropertyGroup>

<ItemGroup>
Expand Down
226 changes: 158 additions & 68 deletions src/BioFSharp/BioID.fs
Original file line number Diff line number Diff line change
Expand Up @@ -125,81 +125,171 @@ module BioID =
// let parseMgiId = Regex.tryEitherParse MgiId "MGI:[0-9]*"


// module FastA =

module FastA =

open System.Text.RegularExpressions

// open FSharp.Care.Regex
// open FSharp.Care.Monads
// open FSharp.Care.Collections

// type FastaHeader<'IdType> = {
// ID : 'IdType
// Description : string
// Info : Map<string,string>
// }
/// A parsed FastA header: an identifier, a description text and a
/// string-keyed map of further attributes.
type FastaHeader<'IdType> =
    {
        /// Identifier of the entry; the identifier type is chosen by the caller.
        ID          : 'IdType
        /// Description text of the entry.
        Description : string
        /// Additional attributes, stored as key/value strings.
        Info        : Map<string,string>
    }

// let createFastaHeader id description info =
// {ID=id;Description=description;Info=info}
/// Builds a FastaHeader record from an identifier, a description and an
/// attribute map.
let createFastaHeader id description info =
    {
        ID          = id
        Description = description
        Info        = info
    }


// /// Returns DisplayId of FastA header. None if none present.
// let displayIdOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "DID"
// /// Returns Aliases of FastA header. None if none present.
// let aliasesOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "ALS"
// /// Returns DataBaseVersion of FastA header. None if none present.
// let dataBaseVersionOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "DBV"
// /// Returns Type of UniqueIdentifier of FastA header. None if none present.
// let touOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "TOU"
// /// Returns SequenceVersion of FastA header. None if none present.
// let sequenceVersionOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "SV"
// /// Returns OrganismName of FastA header. None if none present.
// let organismNameOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "OS"
// /// Returns ProteinExistence of FastA header. None if none present.
// let proteinExistenceOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "PE"
// /// Returns GeneName of FastA header. None if none present.
// let geneNameOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "GN"
// /// Returns ProteinName of FastA header. None if none present.
// let proteinNameOf (header:FastaHeader<_>) =
// header.Info.TryFindDefault "None" "PN"



// /// Sets DisplayId in FastA header.
// let setDisplayId displayId (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("DID",displayId)}
// /// Sets Aliases in FastA header.
// let setAliases alias (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("ALS",alias)}
// /// Sets DataBaseVersion in FastA header.
// let setDataBaseVersion dbv (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("DBV",dbv)}
// /// Sets Type of UniqueIdentifier in FastA header.
// let setIdType idType (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("TOU",idType)}
// /// Sets SequenceVersion in FastA header.
// let setSequenceVersion sv (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("SV",sv)}
// /// Sets OrganismName in FastA header.
// let setOrganismName os (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("OS",os)}
// /// Sets ProteinExistence in FastA header.
// let setProteinExistence pe (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("PE",pe)}
// /// Sets GeneName in FastA header.
// let setGeneName gn (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("GN",gn)}
// /// Sets ProteinName in FastA header.
// let setProteinName pn (header:FastaHeader<_>) =
// {header with Info=header.Info.Add("PN",pn)}


// Accessors for the attribute map of a FastA header. Each one hands the
// literals "None" and an attribute key to Info.TryFindDefault.
// NOTE(review): TryFindDefault is defined outside this section; presumably
// "None" is the fallback string returned when the key is missing (a plain
// string, not Option.None) - confirm its argument order.

/// DisplayId ("DID") entry of the header's Info map.
let displayIdOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "DID"

/// Aliases ("ALS") entry of the header's Info map.
let aliasesOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "ALS"

/// DataBaseVersion ("DBV") entry of the header's Info map.
let dataBaseVersionOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "DBV"

/// Type of UniqueIdentifier ("TOU") entry of the header's Info map.
let touOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "TOU"

/// SequenceVersion ("SV") entry of the header's Info map.
let sequenceVersionOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "SV"

/// OrganismName ("OS") entry of the header's Info map.
let organismNameOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "OS"

/// ProteinExistence ("PE") entry of the header's Info map.
let proteinExistenceOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "PE"

/// GeneName ("GN") entry of the header's Info map.
let geneNameOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "GN"

/// ProteinName ("PN") entry of the header's Info map.
let proteinNameOf (header:FastaHeader<_>) =
    let info = header.Info
    info.TryFindDefault "None" "PN"



// Setters for the attribute map of a FastA header. Each one returns a copy
// of the header whose Info map binds the attribute key to the given value,
// replacing any value previously stored under that key.

/// Copy of the header with DisplayId ("DID") set.
let setDisplayId displayId (header:FastaHeader<_>) =
    { header with Info = Map.add "DID" displayId header.Info }

/// Copy of the header with Aliases ("ALS") set.
let setAliases alias (header:FastaHeader<_>) =
    { header with Info = Map.add "ALS" alias header.Info }

/// Copy of the header with DataBaseVersion ("DBV") set.
let setDataBaseVersion dbv (header:FastaHeader<_>) =
    { header with Info = Map.add "DBV" dbv header.Info }

/// Copy of the header with Type of UniqueIdentifier ("TOU") set.
let setIdType idType (header:FastaHeader<_>) =
    { header with Info = Map.add "TOU" idType header.Info }

/// Copy of the header with SequenceVersion ("SV") set.
let setSequenceVersion sv (header:FastaHeader<_>) =
    { header with Info = Map.add "SV" sv header.Info }

/// Copy of the header with OrganismName ("OS") set.
let setOrganismName os (header:FastaHeader<_>) =
    { header with Info = Map.add "OS" os header.Info }

/// Copy of the header with ProteinExistence ("PE") set.
let setProteinExistence pe (header:FastaHeader<_>) =
    { header with Info = Map.add "PE" pe header.Info }

/// Copy of the header with GeneName ("GN") set.
let setGeneName gn (header:FastaHeader<_>) =
    { header with Info = Map.add "GN" gn header.Info }

/// Copy of the header with ProteinName ("PN") set.
let setProteinName pn (header:FastaHeader<_>) =
    { header with Info = Map.add "PN" pn header.Info }


// One regex per "KEY=value" attribute. Each matches the text that follows
// "KEY=", contains no '=', and ends just before the next " KEY=" attribute
// or at the end of the input.

/// Value of the DisplayId attribute (DID=)
let private displayIdRegex =
    Regex(@"(?<=DID=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the Aliases attribute (ALS=)
let private AliasesRegex =
    Regex(@"(?<=ALS=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the DataBaseVersion attribute (DBV=)
let private DataBaseVersionRegex =
    Regex(@"(?<=DBV=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the Type of UniqueIdentifier attribute (TOU=)
let private TypeOfUniqueIdentifierRegex =
    Regex(@"(?<=TOU=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the SequenceVersion attribute (SV=)
let private SequenceVersionRegex =
    Regex(@"(?<=SV=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the OrganismName attribute (OS=)
let private OrganismNameRegex =
    Regex(@"(?<=OS=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the OrganismIdentifier attribute (OX=)
let private OrganismIdentifierRegex =
    Regex(@"(?<=OX=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the ProteinExistence attribute (PE=)
let private ProteinExistenceRegex =
    Regex(@"(?<=PE=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the GeneName attribute (GN=)
let private GeneNameRegex =
    Regex(@"(?<=GN=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Value of the ProteinName attribute (PN=)
let private ProteinNameRegex =
    Regex(@"(?<=PN=)[^=]*(?= \S*=|$)", RegexOptions.Compiled)

/// Description without attributes: the text that directly follows a '|',
/// contains neither '=' nor '|', and ends just before the first " KEY="
/// attribute or at the end of the input.
// The pipe inside the lookbehind has to be escaped. Unescaped, "(?<=|)" is an
// alternation of two empty patterns, which succeeds at every position, so the
// match was not anchored to a pipe at all and could start before the first one.
let private descriptionRegex = new Regex(@"(?<=\|)[^=|]*(?= \S*=|$)", options = RegexOptions.Compiled)


/// Returns a FastaHeader built from a UniProt style FastA header string,
/// i.e. up to three '|'-separated fields followed by "KEY=value" attributes.
///
/// For Reference see: https://www.uniprot.org/help/fasta-headers
///
/// Three fields ("a|b|rest"):
///   ID          = second field
///   Description = first field + " " + the attribute-free description text, trimmed
///   Info        = every recognised attribute (DID, ALS, DBV, TOU, SV, OS, OX,
///                 PE, GN, PN) found in the third field
/// Two fields ("a|b"): ID = second field, empty Description and Info.
/// No '|' at all: ID = the whole input, empty Description and Info.
///
/// The last two cases used to raise IndexOutOfRangeException, because the
/// length guard did not match the indices that were read afterwards.
let fromString (str:string) =

    // Value of a successful match, None otherwise.
    let matchToResultOption (m : Match) = if m.Success then Some m.Value else None

    // Attribute key paired with the regex that extracts its value.
    let attributeRegexes =
        [
            "DID", displayIdRegex
            "ALS", AliasesRegex
            "DBV", DataBaseVersionRegex
            "TOU", TypeOfUniqueIdentifierRegex
            "SV",  SequenceVersionRegex
            "OS",  OrganismNameRegex
            "OX",  OrganismIdentifierRegex
            "PE",  ProteinExistenceRegex
            "GN",  GeneNameRegex
            "PN",  ProteinNameRegex
        ]

    // Collects a (key, value) pair for every attribute present in s.
    let parseInfo (s:string) =
        attributeRegexes
        |> List.choose (fun (key, regex) ->
            regex.Match(s)
            |> matchToResultOption
            |> Option.map (fun value -> key, value))

    // At most three fields; further pipes stay inside the third field.
    let splitStr = str.Split([|'|'|],3)

    match splitStr with
    | [| prefix; id; attributes |] ->
        // The description regex runs on the complete header, as before.
        let descr = prefix + " " + descriptionRegex.Match(str).Value
        createFastaHeader id (descr.Trim()) (parseInfo attributes |> Map.ofList)
    | [| _; id |] ->
        createFastaHeader id "" Map.empty
    | _ ->
        // No separator: keep the raw header as identifier instead of failing.
        createFastaHeader str "" Map.empty
//// /// Returns FastaHeader from string
//// let fromString (str:string) =
//// let rec loop (input:(string*(string->Either<(string*string),string>)) list) acc (s:string) =
Expand Down

0 comments on commit f2a16aa

Please sign in to comment.