-
Notifications
You must be signed in to change notification settings - Fork 32
/
OntologyEnrichment.fs
162 lines (136 loc) · 7.97 KB
/
OntologyEnrichment.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
namespace BioFSharp.Stats
open System
open System.Collections.Generic
open FSharpAux
module OntologyEnrichment =
/// Represents an item in an ontology set
type OntologyItem<'a> = {
Id : string
OntologyTerm : string
GroupIndex : int
Item : 'a
}
/// Creates an item in an ontology set
let createOntologyItem id ontologyTerm groupIndex item =
{Id = id; OntologyTerm = ontologyTerm; GroupIndex = groupIndex; Item = item}
/// Represents a gene set enrichment result
type GseaResult<'a> = {
///Ontology term e.g. MapMan term, GO term ...
OntologyTerm : string
///Sequence of single items associated to the ontology term
ItemsInBin : seq<OntologyItem<'a>>
///Number of significantly altered items in 'OntologyTerm' bin
NumberOfDEsInBin : int
///Number of items in 'OntologyTerm' bin
NumberInBin : int
///Number of significantly altered items within the total data set
TotalNumberOfDE : int
///Number of all items (expanded)
TotalUniverse : int
PValue : float
}
/// Creates a gene set enrichment result
let createGseaResult ontologyTerm desInBin numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue =
{OntologyTerm = ontologyTerm;ItemsInBin = desInBin; NumberOfDEsInBin = numberOfDEsInBin;
NumberInBin = numberInBin; TotalNumberOfDE = totalNumberOfDE; TotalUniverse = totalUnivers; PValue = pValue}
///Splits an OntologyEntry with seperator concatenated TermIds
let splitMultipleAnnotationsBy (separator:char) (item:OntologyItem<'A>) =
let annotations = item.OntologyTerm.Split(separator)
annotations
|> Seq.map (fun ot -> {item with OntologyTerm = ot})
/// Splits MapMan OntologyEntries with seperator concatenated TermIds
/// Attention: Also parses string to int to get rid of 0 - terms
let splitMapManOntologyItemsBy (separator:char) (data:seq<OntologyItem<'a>>) =
let splitTerm (termId:string) (separator:char) =
termId.Split(separator)
|> Array.map (fun sTerm ->
let splited = sTerm.Split('.')
let toInt = splited |> Seq.map (fun v -> Int32.Parse(v).ToString())
toInt |> String.concat "."
)
data
|> Seq.collect (fun oi ->
splitTerm oi.OntologyTerm separator
|> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item)
)
/// Extends leaf OntologyEntries to their full tree
let expandOntologyTree (data:seq<OntologyItem<'a>>) =
data
|> Seq.collect (fun oi ->
let expandenTermIds = oi.OntologyTerm.Split('.') |> Array.scanReduce (fun acc elem -> acc + "." + elem)
expandenTermIds |> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item)
)
// ###########################################################################################################
// the hypergeometric distribution is a discrete probability distribution that describes the probability of
// k successes in
// n draws from a finite
// x population of size containing
// m successes without replacement (successes states)
/// Calculates p value based on hypergeometric distribution (pValue <= k)
let CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE (splitPvalueThreshold:int) =
if (numberOfDEsInBin > 1) then
let hp = FSharp.Stats.Distributions.Discrete.hypergeometric totalUnivers totalNumberOfDE numberInBin
if numberInBin > splitPvalueThreshold then
// Calculate normal pValue
1. - hp.CDF (float (numberOfDEsInBin - 1))
else
// Calculate split pValue
0.5 * ((1. - hp.CDF(float(numberOfDEsInBin - 1)) ) + ( (1. - hp.CDF(float(numberOfDEsInBin))) ) )
else
nan
// #######################################################
// functional term enrichment is calculated according to following publication
// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401
// also includes mid-pValues
/// Calculates functional term enrichment
let CalcSimpleOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (data:seq<OntologyItem<'a>>) =
let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5
let totalUnivers = data |> Seq.length
let totalNumberOfDE = data |> Seq.filter (fun oi -> oi.GroupIndex = deGroupIndex) |> Seq.length
// returns (DE count, all count)
let countDE (subSet:seq<OntologyItem<'a>>) =
let countMap =
subSet
|> Seq.countBy (fun oi -> if oi.GroupIndex = deGroupIndex then true else false )
|> Map.ofSeq
(countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false))
data
|> Seq.groupBy ( fun oi -> oi.OntologyTerm)
|> Seq.map (fun (oTerm,values) ->
let numberOfDEsInBin,numberInBin = countDE values
let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold
createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue)
// #######################################################
// functional term enrichment is calculated according to following publication
// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401
// also includes mid-pValues
/// Calculates functional term enrichment
let CalcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (minNumberInTerm:option<int>) (data:seq<OntologyItem<'a>>) =
let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5
let _minNumberInTerm = defaultArg minNumberInTerm 2
// Distinct by term and gene name
// Has to be done by an ouside function
//let distinctData = data |> Seq.distinctBy (fun o -> o.displayID)
let gData = data |> Seq.groupBy ( fun o -> o.OntologyTerm)
// reduce to terms at least annotated with 2 items
let fData = gData |> Seq.filter ( fun (key:string,values:seq<OntologyItem<'a>>) -> Seq.length(values) >= _minNumberInTerm)
let groupCount = fData |> Seq.collect (fun (key:string,values:seq<OntologyItem<'a>>) -> values ) |> Seq.countBy (fun o -> o.GroupIndex)
let totalUnivers = groupCount |> Seq.fold (fun (acc:int) (index:int,count:int) -> acc + count) 0
let totalNumberOfDE =
let tmp = groupCount |> Seq.tryFind (fun (key,v) -> key = deGroupIndex)
if tmp.IsNone then
raise (System.ArgumentException("DE group index does not exists in ontology entry"))
else
snd(tmp.Value)
// returns (DE count, all count)
let countDE (subSet:seq<OntologyItem<'a>>) =
let countMap =
subSet
|> Seq.countBy (fun (oi) -> oi.GroupIndex = deGroupIndex)
|> Map.ofSeq
(countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false))
fData
|> Seq.map (fun (oTerm,values) ->
let numberOfDEsInBin,numberInBin = countDE values
let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold
createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue)