-
-
Notifications
You must be signed in to change notification settings - Fork 241
/
example_pcm.py
36 lines (28 loc) · 1.34 KB
/
example_pcm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import librosa
import numpy as np
import tensorflow as tf
from deep_speaker.constants import SAMPLE_RATE
from deep_speaker.conv_models import DeepSpeakerModel
from deep_speaker.test import batch_cosine_similarity
# Build the Deep Speaker network in raw-PCM input mode (pcm_input=True).
model = DeepSpeakerModel(pcm_input=True)

# Restore the trained checkpoint into the underlying model object (model.m),
# matching weights to layers by name.
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

# Audio files to compare: the first two are used as the "same speaker" pair
# and the third as the "different speaker" clip in the check at the bottom.
samples = [
    'samples/PhilippeRemy/PhilippeRemy_001.wav',
    'samples/PhilippeRemy/PhilippeRemy_002.wav',
    'samples/1255-90413-0001.flac',
]

# Decode every file as mono audio at SAMPLE_RATE. Only the first element of
# what librosa.load returns (the waveform) is kept.
pcm = [librosa.load(path, sr=SAMPLE_RATE, mono=True)[0] for path in samples]

# The clips have to share one length before they can be stacked into a batch,
# so take the shortest clip as the target and cut a centered window of that
# length out of each waveform.
num_samples = min(map(len, pcm))
offsets = [(len(clip) - num_samples) // 2 for clip in pcm]
centered = [clip[start:start + num_samples] for clip, start in zip(pcm, offsets)]
pcm = tf.convert_to_tensor(np.stack(centered))

# Run the whole batch through the network to get one embedding per file
# (the original author notes a (1, 512) embedding for each file).
predict = model.m.predict(pcm)

# Cosine similarity of the first embedding against each of the other two.
reference, candidates = predict[0:1], predict[1:]
speaker_similarity = batch_cosine_similarity(reference, candidates)

# Entry 0 compares the two PhilippeRemy clips; entry 1 compares the first
# clip with the remaining file. The same-speaker score should be the larger.
same_speaker_similarity, diff_speaker_similarity = speaker_similarity[0], speaker_similarity[1]

print('SAME SPEAKER', same_speaker_similarity)  # SAME SPEAKER [0.81564593]
print('DIFF SPEAKER', diff_speaker_similarity)  # DIFF SPEAKER [0.1419204]

assert same_speaker_similarity > diff_speaker_similarity