forked from GUDHI/TDA-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 1
/
persistence_statistics.py
118 lines (83 loc) · 3.21 KB
/
persistence_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def hausd_interval(data, level = 0.95, m=-1, B =1000,pairwise_dist = False,
leaf_size = 2,ncores = None):
'''
Subsampling Confidence Interval for the Hausdorff Distance between a
Manifold and a Sample
Fasy et al AOS 2014
Input:
data : a nxd numpy array representing n points in R^d, or a nxn matrix of pairwise distances
m : size of each subsample. If m=-1 then m = n / np.log(n)
B : number of subsamples
level : confidence level
pairwise_dist : if pairwise_dist = True then data is a nxn matrix of pairwise distances
leaf_size : leaf size for KDTree
ncores : number of cores for multiprocessing (if None then the maximum number of cores is used)
Output:
quantile for the Hausdorff distance
'''
import numpy as np
from multiprocessing import Pool
from sklearn.neighbors import KDTree
# sample size
n = np.size(data,0)
# subsample size
if m == -1:
m = int (n / np.log(n))
# Data is an array
if pairwise_dist == False:
# for subsampling
# a reprendre sans shuffle slit
global hauss_dist
def hauss_dist(m):
'''
Distances between the points of data and a random subsample of data of size m
'''
I = np.random.choice(n,m)
Icomp = [item for item in np.arange(n) if item not in I]
tree = KDTree(data[I,],leaf_size=leaf_size)
dist, ind = tree.query(data[Icomp,],k=1)
hdist = max(dist)
return(hdist)
# parrallel computing
with Pool(ncores) as p:
dist_vec = p.map(hauss_dist,[m]*B)
p.close()
dist_vec = [a[0] for a in dist_vec]
# Data is a matrix of pairwise distances
else:
def hauss_dist(m):
'''
Distances between the points of data and a random subsample of data of size m
'''
I = np.random.choice(n,m)
hdist= np.max([np.min(data[I,j]) for j in np.arange(n) if j not in I])
return(hdist)
# parrallel computing
with Pool(ncores) as p:
dist_vec = p.map(hauss_dist, [m]*B)
p.close()
# quantile and confidence band
myquantile = np.quantile(dist_vec, level)
c = 2 * myquantile
return(c)
def truncated_simplex_tree(st,int_trunc=100):
'''
This function return a truncated simplex tree
Input:
st : a simplex tree
int_trunc : number of persistent interval keept per dimension (the largest)
Ouptut:
st_trunc_pers : truncated simplex tree
'''
st.persistence()
dim = st.dimension()
st_trunc_pers = [];
for d in range(dim):
pers_d = st.persistence_intervals_in_dimension(d)
d_l= len(pers_d)
if d_l > int_trunc:
pers_d_trunc = [pers_d[i] for i in range(d_l-int_trunc,d_l)]
else:
pers_d_trunc = pers_d
st_trunc_pers = st_trunc_pers + [(d,(l[0],l[1])) for l in pers_d_trunc]
return(st_trunc_pers)