Skip to content

Commit

Permalink
Add verbose option to Sailent (fixes #74)
Browse files Browse the repository at this point in the history
Add computeOfSAResult to directly compute Sailent of a surprisal analysis result type
  • Loading branch information
kMutagene committed Oct 22, 2019
1 parent e7ecd37 commit 97cd339
Showing 1 changed file with 70 additions and 10 deletions.
80 changes: 70 additions & 10 deletions src/BioFSharp.Stats/SurprisalAnalysisEmpiricalPermutationTest.fs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,17 @@ module Sailent =

let private createSailentResult raw abs pos neg iter = {RawData=raw; AbsoluteDescriptor=abs; PositiveDescriptor=pos; NegativeDescriptor=neg; BootstrapIterations=iter}

let getDistinctGroups (cons:OntologyItem<float> array) =
let private getDistinctGroups (cons:OntologyItem<float> array) =
[for ann in cons do yield ann.OntologyTerm]
|> List.distinct

//create distribution of iter weight sums for a bin of size binSize
let private bootstrapBin (binSize: int) (weightArray:float[]) (iter: int) =
let private bootstrapBin (verbose:bool) (binSize: int) (weightArray:float[]) (iter: int) =
let steps = iter / 10
let startTime = System.DateTime.Now
let rec loop currentIter resultList =
if verbose && (currentIter % steps = 0) then
printfn "[%i/%i] iterations @%i min" (System.DateTime.Now.Subtract(startTime).Minutes) currentIter iter
if currentIter < iter then
let tmp = Array.shuffleFisherYates weightArray
loop (currentIter+1) ((Array.sum (tmp.[0..binSize-1]))::resultList)
Expand Down Expand Up @@ -67,15 +71,18 @@ module Sailent =
|> List.map (fun (name,binSize,weightSum) -> createSailentCharacterization name (getEmpiricalPvalue testDistributions weightSum binSize) binSize weightSum)

///utility function to prepare a dataset column for SAILENT characterization. The ontology map can be created by using the BioFSharp.BioDB module.
let prepareDataColumn (ontologyMap:Map<string,(string*string)list>) (identifiers: string []) (rawData:float []) =
///
///identifiers: a string array containing the annotations of the data at the same index, used as lookup in the ontology map.
///rawData: feature array of interest, must be same length as annotations.
let prepareDataColumn (ontologyMap:Map<string,(string*string) [] >) (identifiers: string []) (rawData:float []) =

if rawData.Length <> identifiers.Length then
failwithf "data column and identifiers dont have the same length (%i vs %i)" rawData.Length identifiers.Length
else
let annotatedIds =
identifiers
|> Array.map (fun id -> match Map.tryFind id ontologyMap with
|Some ann -> ann |> List.map snd |> Array.ofSeq
|Some ann -> ann |> Array.map snd |> Array.ofSeq
|_ -> [|"35.2"|]
)
rawData
Expand All @@ -86,11 +93,29 @@ module Sailent =
|> Array.map (fun (identifier,annotation,indx,value) -> createOntologyItem identifier annotation indx value)

///utility function to prepare a dataset (in column major form) for SAILENT characterization. The ontology map can be created by using the BioFSharp.BioDB module.
let prepareDataset (ontologyMap:Map<string,(string*string)list>) (identifiers: string []) (rawDataset:float [] []) =
///identifiers: a string array containing the annotations of the data at the same index, used as lookup in the ontology map.
///rawData: feature matrix of interest, columns must have same length as identifiers
let prepareDataset (ontologyMap:Map<string,(string*string) [] >) (identifiers: string []) (rawDataset:float [] []) =
rawDataset
|> Array.map (prepareDataColumn ontologyMap identifiers)

let compute (bootstrapIterations:int) (data: OntologyItem<float> array) =
///Compute SAILENT (Surprisal AnalysIs EmpiricaL pErmutatioN Test) for the given annotated dataset. This empirical test was
///initially designed for the biological application of Surprisal Analysis to test the weight distribution of a given bin of annotations is significantly different than a random distribution
///of the same size given the whole dataset, but it should be applicable to similar types of datasets.
///
///Input:
///
///- verbose: if true, bootstrap iterations and runtime for bootstrapping is printed
///
///- bootstrapIterations: the amount of distributions to sample from the whole dataset to create test distributions for each binsize present in the data
///
///- data: annotated dataset (containing ontology items with the associated feature)
///
///a SAILENT test returns 3 descriptors for the input data:
///Absolute descriptor: test distributions and tests are performed on the absolute values of the dataset
///Negative descriptor: test distributions and tests are performed on the negative values of the dataset only
///Absolute descriptor: test distributions and tests are performed on the positive values of the dataset only
let compute (verbose:bool) (bootstrapIterations:int) (data: OntologyItem<float> array) =

printfn "starting SAILENT characterization"

Expand Down Expand Up @@ -133,7 +158,7 @@ module Sailent =
let absoluteTestDistributions =
[|
for binSize in absoluteBinsizes do
let tmp = bootstrapBin binSize weightArr bootstrapIterations
let tmp = bootstrapBin verbose binSize weightArr bootstrapIterations
yield (binSize,Distributions.Frequency.create (Distributions.Bandwidth.nrd0 tmp) tmp)
|]
|> Map.ofArray
Expand All @@ -142,7 +167,7 @@ module Sailent =
let positiveTestDistributions =
[|
for binSize in positiveBinsizes do
let tmp = bootstrapBin binSize posWeightArr bootstrapIterations
let tmp = bootstrapBin verbose binSize posWeightArr bootstrapIterations
yield (binSize,Distributions.Frequency.create (Distributions.Bandwidth.nrd0 tmp) tmp)
|]
|> Map.ofArray
Expand All @@ -151,7 +176,7 @@ module Sailent =
let negativeTestDistributions =
[|
for binSize in negativeBinsizes do
let tmp = bootstrapBin binSize negWeightArr bootstrapIterations
let tmp = bootstrapBin verbose binSize negWeightArr bootstrapIterations
yield (binSize,Distributions.Frequency.create (Distributions.Bandwidth.nrd0 tmp) tmp)
|]
|> Map.ofArray
Expand All @@ -162,3 +187,38 @@ module Sailent =
let negResults = assignPValues negativeTestDistributions negativeTestTargets

createSailentResult data absResults posResults negResults bootstrapIterations


///Compute SAILENT (Surprisal AnalysIs EmpiricaL pErmutatioN Test) for the given Surprisal Analysis result. This empirical test was
///is designed for the biological application of Surprisal Analysis to test the weight distribution of a given bin of annotations is significantly different than a random distribution
///of the same size given the whole dataset.
///
///Input:
///
///- verbose: if true, bootstrap iterations and runtime for bootstrapping is printed
///
///- ontologyMap: maps identifiers of the data to ontology annotations (can be created using the BioFSharp.BioDB module)
///
///- identifiers: a string array containing the annotations of the data at the same index, used as lookup in the ontology map.
///
///- bootstrapIterations: the amount of distributions to sample from the whole dataset to create test distributions for each binsize present in the data
///
///- saRes: the Surprisal Analysis Result to test
///
///a SAILENT test returns 3 descriptors for each constraint of the Surprisal Nalysis result:
///Absolute descriptor: test distributions and tests are performed on the absolute values of the dataset
///Negative descriptor: test distributions and tests are performed on the negative values of the dataset only
///Absolute descriptor: test distributions and tests are performed on the positive values of the dataset only
let computeOfSARes (verbose:bool) (ontologyMap:Map<string,(string*string) [] >) (identifiers: string []) (bootstrapIterations:int) (saRes:FSharp.Stats.ML.SurprisalAnalysis.SAResult) =
saRes.MolecularPhenotypes
|> Matrix.toJaggedArray
// Matrices are sadly row major =(
|> JaggedArray.transpose
|> prepareDataset ontologyMap identifiers
|> Array.mapi
(fun i p ->
printfn "Sailent of constraint %i" i
compute verbose bootstrapIterations p
)

0 comments on commit 97cd339

Please sign in to comment.