-
Notifications
You must be signed in to change notification settings - Fork 85
0.1. Fundamentals
This section documents all the scripts in the Chapter_1_fundamentals folder.
Voice computing aims to develop hardware or software to process voice inputs. Here are some common terms you will encounter in this discipline:
Term | Definition |
---|---|
voice computer | any computerized system (assembled hardware and software) that can process voice inputs. |
voice computing software | can read/write, record, clean, encrypt/decrypt, playback, transcode, transcribe, compress, publish, featurize, model, and visualize voice files. |
voice computing hardware | can include a motherboard, microphones, sound cards (with D/A and A/D converters), central processing units (CPUs), graphics cards, storage devices (e.g. hard disks), computer monitors, WiFi chips, bluetooth chips, radio transmitters, speakers, and a power supply. |
microphone | a transducer that converts sound (e.g. pressure waves in air) into an electrical signal (e.g. amps - C/s). |
sound cards | convert audio from PCM data to various audio formats (e.g. .WAV) through audio codecs. |
codec | software program used to encode and decode digital audio data to and from a digital audio coding format. |
audio coding format | the output file type of a digital signal that has been manipulated by an audio codec program. |
transcoding | the process of converting one audio coding format to another. |
audio channels | the number of audio inputs or outputs of a recorded audio signal. |
speaker | a transducer that operates in the reverse direction of a microphone: it converts an electrical signal (e.g. current - amps) into analog sound. |
See getting started.
using various libraries: pydub, wave, librosa, scipy, and soundfile.
from pydub import AudioSegment
# Load the WAV file into an in-memory AudioSegment.
data = AudioSegment.from_wav("test.wav")
# export() encodes as MP3 unless told otherwise, so the format must be given
# explicitly for the output to really be a WAV file.
data.export("new_test.wav", format="wav")
import wave
# Open the file read-only; the context manager closes the handle even if
# reading the header fails.
with wave.open('test.wav', mode='rb') as data:
    params = data.getparams()
# params is a named tuple describing the file, for example:
# _wave_params(nchannels=1, sampwidth=2, framerate=16000, nframes=47104, comptype='NONE', compname='not compressed')
import librosa
import soundfile as sf

# Load the samples (y) and the sample rate (sr) of the file.
y, sr = librosa.load('test.wav')
# librosa.output.write_wav was removed in librosa 0.8; soundfile is the
# replacement for writing the samples back to disk.
sf.write('new_test.wav', y, sr)
from scipy.io import wavfile
# wavfile.read returns the sample rate first and the sample array second.
sample_rate, samples = wavfile.read('test.wav')
# Write the same samples back out at the same rate under a new name.
wavfile.write('new_test.wav', sample_rate, samples)
import soundfile as sf
# sf.read returns the sample array first and the sample rate second.
samples, sample_rate = sf.read('test.wav')
# The output container is picked from the file extension (.ogg here).
sf.write('new_test.ogg', samples, sample_rate)
Assumes SoX is installed on the host system.
import os
# Each command shells out to SoX; one.wav / two.wav must exist in the
# current working directory.

# take in one.wav and two.wav to make three.wav
os.system('sox one.wav two.wav three.wav')
# take first second of one.wav and output to output.wav
os.system('sox one.wav output.wav trim 0 1')
# make volume 2x in one.wav and output to volup.wav
os.system('sox -v 2.0 one.wav volup.wav')
# make volume ½ in one.wav and output to voldown.wav
# (positive factor 0.5, written to its own file so volup.wav is not overwritten)
os.system('sox -v 0.5 one.wav voldown.wav')
# reverse one.wav and output to reverse.wav
os.system('sox one.wav reverse.wav reverse')
# change sample rate of one.wav to 16000 Hz
os.system('sox one.wav -r 16000 sr.wav')
# change audio file to 16 bit quality
# (-b sits directly before the output file, like -r and -c in the
# neighbouring commands, so it describes the file being written)
os.system('sox one.wav -b 16 16bit.wav')
# convert mono file to stereo by cloning channels
os.system('sox one.wav -c 2 stereo.wav')
# make stereo file mono by averaging out the channels
os.system('sox stereo.wav -c 1 mono.wav')
# double speed of file
os.system('sox one.wav 2x.wav speed 2.0')
play_sync.py
'''
play_sync.py
Play back an audio file synchronously.
'''
import pygame
def sync_playback(filename):
    """Play an audio file and block until playback has finished.

    Args:
        filename: path of the audio file to load into the pygame mixer.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    pygame.mixer.music.play()
    # music.play() only starts playback and returns straight away; poll the
    # mixer so this call really is synchronous and the script does not exit
    # before the audio has been heard.
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)


sync_playback('one.wav')
play_async.py
import sounddevice as sd
import soundfile as sf
import time
def async_playback(filename):
    """Start playing an audio file and return without waiting for it to end.

    Args:
        filename: path of the audio file to read.

    Returns:
        The (samples, sample_rate) pair that was read from the file.
    """
    samples, sample_rate = sf.read(filename)
    sd.play(samples, sample_rate)
    return samples, sample_rate


# start playback; the call returns while the audio is still playing
data, fs = async_playback('play.wav')

# other work can run in the meantime
print('able to execute this before finishing')
print('hi, this is cool!')

# cut playback off after one second
time.sleep(1)
sd.stop()
print('stopped')
mic_check.py
import sounddevice as sd
# list of every audio device sounddevice can see
mics = sd.query_devices()
# the default device setting is read here as an (input, output) pair
default_devices = sd.default.device
default_input = default_devices[0]
default_output = default_devices[1]

# prints all available devices
for mic in mics:
    print(mic)

# can set default device easily with
# (the module is imported as `sd`; the bare name `sounddevice` is undefined)
sd.default.device = 0
sync_record.py
import sounddevice as sd
import soundfile as sf
import time
def sync_record(filename, duration, fs, channels):
    """Record from the default input device and save the result to a file.

    Blocks until the whole recording has been captured.

    Args:
        filename: path of the audio file to write.
        duration: length of the recording in seconds.
        fs: sample rate in Hz.
        channels: number of channels to capture.
    """
    print('recording')
    frame_count = int(duration * fs)
    captured = sd.rec(frame_count, samplerate=fs, channels=channels)
    # wait for the recording to complete before touching the buffer
    sd.wait()
    sf.write(filename, captured, fs)
    print('done recording')


# record 10 seconds of mono audio at 16 kHz
sync_record('sync_record.wav', duration=10, fs=16000, channels=1)
async_record.py
import sounddevice as sd
import soundfile as sf
import time
def printstuff(number):
    """Print the integers 0 .. number-1, one per line."""
    for line in map(str, range(number)):
        print(line)
def async_record(filename, duration, fs, channels):
    """Record audio while other code runs, then save the result to a file.

    Args:
        filename: path of the audio file to write.
        duration: length of the recording in seconds.
        fs: sample rate in Hz.
        channels: number of channels to capture.
    """
    print('recording')
    total_frames = int(duration * fs)
    recording = sd.rec(total_frames, samplerate=fs, channels=channels)

    # the recording is in progress; other work can run here
    print('able to execute this before finishing')
    printstuff(30)

    # block until the capture is complete, then persist it
    sd.wait()
    sf.write(filename, recording, fs)
    print('done recording')


# record 10 seconds of mono audio at 16 kHz
async_record('async_record.wav', duration=10, fs=16000, channels=1)
using ffmpy module
convert_wav.py
import ffmpy
def convert_wav(filename):
    """Convert an .mp3, .m4a or .ogg file to .wav with ffmpeg (via ffmpy).

    The output is written next to the input, with the same base name and a
    .wav extension. Files with any other extension are left untouched.

    Args:
        filename: path of the audio file to convert.

    Returns:
        The path of the .wav file that was written, or None when the
        extension is not one of the supported input types.
    """
    # compare case-insensitively so 'SONG.MP3' is handled like 'song.mp3'
    if filename[-4:].lower() not in ('.mp3', '.m4a', '.ogg'):
        return None

    # every supported extension is 4 characters long, so slicing is safe here
    wav_filename = filename[:-4] + '.wav'
    ff = ffmpy.FFmpeg(
        inputs={filename: None},
        outputs={wav_filename: None},
    )
    ff.run()
    return wav_filename
# convert the sample MP3; the .wav file is written alongside it
source_file = 'test.mp3'
convert_wav(source_file)
using PocketSphinx
sphinx_transcribe.py
import speech_recognition as sr_audio
import sounddevice as sd
import soundfile as sf
import os, json, datetime
def sync_record(filename, duration, fs, channels):
    """Record audio, wait for it to finish and save it to a file.

    Args:
        filename: path of the audio file to write.
        duration: length of the recording in seconds.
        fs: sample rate in Hz.
        channels: number of channels to capture.
    """
    print('recording')
    n_frames = int(duration * fs)
    audio_buffer = sd.rec(n_frames, samplerate=fs, channels=channels)
    # do not write the buffer until the capture has completed
    sd.wait()
    sf.write(filename, audio_buffer, fs)
    print('done recording')
def transcribe_audio_sphinx(filename):
    """Transcribe an audio file with the Sphinx recognizer.

    Args:
        filename: path of the audio file to transcribe.

    Returns:
        The recognized text, which is also printed.
    """
    recognizer = sr_audio.Recognizer()
    # read the whole file into an audio object before recognizing it
    with sr_audio.AudioFile(filename) as audio_source:
        recorded = recognizer.record(audio_source)
    text = recognizer.recognize_sphinx(recorded)
    print('transcript: '+text)
    return text
def store_transcript(filename, transcript):
    """Save a transcript and its metadata to a JSON file.

    The JSON file is named after the audio file with the extension replaced
    by .json (e.g. sync_record.wav -> sync_record.json).

    Args:
        filename: name of the audio file the transcript belongs to.
        transcript: the transcribed text.
    """
    # splitext copes with extensions of any length (.flac, .mp3, none at all),
    # where slicing off the last four characters would mangle the name
    jsonfilename = os.path.splitext(filename)[0] + '.json'
    print('saving %s to current directory'%(jsonfilename))
    data = {
        'date': str(datetime.datetime.now()),
        'filename': filename,
        'transcript': transcript,
    }
    print(data)
    # the context manager closes the file even if serialization fails
    with open(jsonfilename, 'w') as jsonfile:
        json.dump(data, jsonfile)
# record 10 seconds of mono audio at 16 kHz, then transcribe it
filename = 'sync_record.wav'
sync_record(filename, duration=10, fs=16000, channels=1)
transcript = transcribe_audio_sphinx(filename)

# save the transcript next to the recording as JSON
# (sync_record.wav -> sync_record.json)
store_transcript(filename, transcript)
google_transcribe.py
# assumes environment variables are set properly following the Google Speech API documentation
import speech_recognition as sr_audio
import sounddevice as sd
import soundfile as sf
import os, json, datetime
def transcribe_audio_google(filename):
    """Transcribe an audio file with the Google Cloud recognizer.

    Args:
        filename: path of the audio file to transcribe.

    Returns:
        The recognized text.
    """
    recognizer = sr_audio.Recognizer()
    # read the whole file into an audio object before recognizing it
    with sr_audio.AudioFile(filename) as audio_source:
        recorded = recognizer.record(audio_source)
    return recognizer.recognize_google_cloud(recorded)
def sync_record(filename, duration, fs, channels):
    """Capture audio, block until it is complete and write it to a file.

    Args:
        filename: path of the audio file to write.
        duration: length of the recording in seconds.
        fs: sample rate in Hz.
        channels: number of channels to capture.
    """
    print('recording')
    sample_total = int(duration * fs)
    take = sd.rec(sample_total, samplerate=fs, channels=channels)
    # hold here until the capture has finished
    sd.wait()
    sf.write(filename, take, fs)
    print('done recording')
def store_transcript(filename, transcript):
    """Write a transcript and its metadata to a JSON file.

    The JSON file takes the audio file's name with the extension replaced by
    .json (e.g. google_record.wav -> google_record.json).

    Args:
        filename: name of the audio file the transcript belongs to.
        transcript: the transcribed text.
    """
    # splitext handles extensions of any length, unlike slicing off the last
    # four characters, which only works for three-letter extensions
    jsonfilename = os.path.splitext(filename)[0] + '.json'
    print('saving %s to current directory'%(jsonfilename))
    data = {
        'date': str(datetime.datetime.now()),
        'filename': filename,
        'transcript': transcript,
    }
    print(data)
    # the with-block guarantees the handle is closed if json.dump raises
    with open(jsonfilename, 'w') as jsonfile:
        json.dump(data, jsonfile)
# record 10 seconds of mono audio at 16 kHz, then transcribe it
filename = 'google_record.wav'
sync_record(filename, duration=10, fs=16000, channels=1)
transcript = transcribe_audio_google(filename)

# save the transcript next to the recording as JSON
# (google_record.wav -> google_record.json)
store_transcript(filename, transcript)
using Pyttsx3
Abridged from speak_custom.py
import pyttsx3
def speak_text(text):
    """Speak the given text aloud through a pyttsx3 engine.

    Args:
        text: the sentence to synthesize.
    """
    tts_engine = pyttsx3.init()
    # queue the utterance, then run the engine until the queue is processed
    tts_engine.say(text)
    tts_engine.runAndWait()


speak_text('this is a test')
using Google TTS API
speak_google.py
def speak_google(text, filename, model):
    """Synthesize text with Google text-to-speech and save it as an MP3 file.

    Args:
        text: the text to synthesize.
        filename: path of the MP3 file to write.
        model: name of the voice to use (e.g. 'en-US-Wavenet-A').
    """
    # NOTE(review): the `types` / `enums` namespaces used below look tied to
    # a particular client-library release - confirm against the pinned version.
    from google.cloud import texttospeech

    tts_client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.types.SynthesisInput(text=text)

    # The voice is selected by name here; available names can be listed
    # with the client's list_voices().
    voice_params = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE,
        name=model)
    mp3_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    response = tts_client.synthesize_speech(synthesis_input, voice_params, mp3_config)

    # audio_content is binary, so the file is opened in binary mode
    with open(filename, 'wb') as out:
        out.write(response.audio_content)
    print('Audio content written to file %s'%(filename))
# try the same sentence with several voices
base = 'output'
models = ['en-US-Wavenet-A', 'en-US-Wavenet-B', 'en-US-Wavenet-C', 'en-US-Wavenet-D',
          'en-US-Wavenet-E', 'en-US-Wavenet-F']
text = 'hey I am testing out google TTS'

# one MP3 per voice is written to the current directory,
# named output_<voice name>.mp3
for model in models:
    mp3_name = base + '_' + model + '.mp3'
    speak_google(text, mp3_name, model)
If you are interested in reading more about any of these topics, check out the documentation below.
Reading/writing voice files
Manipulating voice files
Audio file playback
Recording audio files
Audio file conversion
Transcription
Text-to-speech systems (TTS)