Skip to content

Commit

Permalink
Add wake word detector: Hey Llama
Browse files Browse the repository at this point in the history
  • Loading branch information
vietanhdev committed Sep 28, 2024
1 parent 6d632ac commit 267bbb6
Show file tree
Hide file tree
Showing 9 changed files with 349 additions and 31 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ This assistant can run offline on your local machine, and it respects your priva
## TODO

- [x] 🖼️ Support multimodal model: [moondream2](https://huggingface.co/vikhyatk/moondream2).
- [ ] 🎙️ Add offline STT support: WhisperCPP.
- [ ] 🗣️ Add wake word detection: "Hey Llama!".
- [x] 🗣️ Add wake word detection: "Hey Llama!".
- [ ] 🎙️ Add offline STT support: WhisperCPP. [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py).
- [ ] 📚 Support 5 other text models.
- [ ] 🖼️ Support 5 other multimodal models.
- [ ] 🧠 Knowledge database: Langchain or LlamaIndex?.
Expand Down
35 changes: 34 additions & 1 deletion llama_assistant/llama_assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
QFont,
QBitmap,
)
from llama_assistant.wake_word_detector import WakeWordDetector

from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
from llama_assistant.global_hotkey import GlobalHotkey
Expand All @@ -54,6 +55,7 @@
class LlamaAssistant(QMainWindow):
def __init__(self):
super().__init__()
self.wake_word_detector = None
self.load_settings()
self.init_ui()
self.init_tray()
Expand All @@ -66,6 +68,18 @@ def __init__(self):
self.current_text_model = self.settings.get("text_model")
self.current_multimodal_model = self.settings.get("multimodal_model")

def init_wake_word_detector(self):
    """(Re)create the 'Hey Llama' detector and start listening in the background."""
    # Tear down any previous detector first so only one is ever running.
    if self.wake_word_detector is not None:
        self.deinit_wake_word_detector()
    detector = WakeWordDetector()
    detector.wakeword_detected.connect(self.on_wake_word_detected)
    self.wake_word_detector = detector
    detector.start()

def deinit_wake_word_detector(self):
    """Stop the wake-word detector if it is running and drop the reference."""
    detector = self.wake_word_detector
    if detector.running:
        detector.stop()
    self.wake_word_detector = None

def load_settings(self):
home_dir = Path.home()
settings_dir = home_dir / "llama_assistant"
Expand All @@ -90,8 +104,14 @@ def load_settings(self):
"transparency": 90,
"text_model": "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
"multimodal_model": "vikhyatk/moondream2",
"hey_llama_chat": False,
"hey_llama_mic": False,
}
self.save_settings()
if self.settings.get("hey_llama_chat", False) and self.wake_word_detector is None:
self.init_wake_word_detector()
if not self.settings.get("hey_llama_chat", False) and self.wake_word_detector is not None:
self.deinit_wake_word_detector()
self.current_text_model = self.settings.get("text_model")
self.current_multimodal_model = self.settings.get("multimodal_model")

Expand Down Expand Up @@ -577,6 +597,13 @@ def mouseMoveEvent(self, event):
self.move(self.x() + delta.x(), self.y() + delta.y())
self.oldPos = event.globalPosition().toPoint()

def on_wake_word_detected(self, model_name):
    """Bring the assistant window to the foreground when the wake word fires.

    model_name is supplied by the detector's signal; it is not used here.
    """
    # Show, focus, and raise the window, in that order.
    for bring_forward in (self.show, self.activateWindow, self.raise_):
        bring_forward()
    # Optionally start recording immediately ("Hey Llama" mic mode).
    if self.settings.get("hey_llama_mic", False):
        self.start_voice_input()

def toggle_voice_input(self):
if not self.is_listening:
self.start_voice_input()
Expand Down Expand Up @@ -627,9 +654,15 @@ def on_speech_recognized(self, text):
self.input_field.setPlainText(f"{current_text}\n{text}")
else:
self.input_field.setPlainText(text)
self.stop_voice_input()

def on_speech_error(self, error_message):
    """Log a speech-recognition failure and stop the voice input session.

    Fix: the block contained both the old bare ``print(error_message)`` and
    its replacement (a leftover diff artifact), printing the error twice.
    Only the descriptive message is kept.
    """
    print(f"Speech recognition error: {error_message}")
    self.stop_voice_input()

def closeEvent(self, event):
    """Qt close hook: shut down the wake-word thread before the window closes.

    Fix: ``self.wake_word_detector`` is initialized to ``None`` and only
    created when the "hey_llama_chat" setting is enabled, so calling
    ``stop()`` unconditionally raised AttributeError on close whenever the
    wake word feature was disabled. Guard against None first.
    """
    if self.wake_word_detector is not None:
        self.wake_word_detector.stop()
    super().closeEvent(event)


if __name__ == "__main__":
Expand Down
Binary file added llama_assistant/resources/wk_hey_llama.onnx
Binary file not shown.
95 changes: 71 additions & 24 deletions llama_assistant/setting_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
QVBoxLayout,
QHBoxLayout,
QWidget,
QCheckBox,
QGroupBox,
)
from PyQt6.QtCore import pyqtSignal
from PyQt6.QtCore import Qt
Expand All @@ -28,52 +30,88 @@ def __init__(self, parent=None):
self.setWindowTitle("Settings")
self.main_layout = QVBoxLayout(self)

# Create a form layout for the settings
form_widget = QWidget()
self.layout = QFormLayout(form_widget)
self.layout.setFormAlignment(Qt.AlignmentFlag.AlignLeft)
self.layout.setLabelAlignment(Qt.AlignmentFlag.AlignLeft)
# General Settings Group
self.create_general_settings_group()

# Appearance Settings Group
self.create_appearance_settings_group()

# Model Settings Group
self.create_model_settings_group()

# Voice Activation Settings Group
self.create_voice_activation_settings_group()

# Create a horizontal layout for the save button
button_layout = QHBoxLayout()
self.save_button = QPushButton("Save")
self.save_button.clicked.connect(self.accept)
button_layout.addStretch()
button_layout.addWidget(self.save_button)

# Add the button layout to the main layout
self.main_layout.addLayout(button_layout)

self.load_settings()

def create_general_settings_group(self):
    """Build the 'General Settings' group: hotkey recorder plus reset button.

    Fix: the block carried stale pre-refactor lines (``self.layout.addRow``)
    duplicated next to their replacements (``layout.addRow``) — a diff
    artifact; each row is now added exactly once to the group's own layout.
    """
    group_box = QGroupBox("General Settings")
    layout = QFormLayout()

    self.shortcut_recorder = ShortcutRecorder()
    layout.addRow("Shortcut:", self.shortcut_recorder)

    self.reset_shortcut_button = QPushButton("Reset Shortcut")
    self.reset_shortcut_button.clicked.connect(self.reset_shortcut)
    layout.addRow(self.reset_shortcut_button)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def create_appearance_settings_group(self):
    """Build the 'Appearance Settings' group: background color and transparency.

    Fix: stale pre-refactor ``self.layout.addRow`` lines sat beside their
    ``layout.addRow`` replacements (diff artifact); each row is added once.
    """
    group_box = QGroupBox("Appearance Settings")
    layout = QFormLayout()

    self.color_button = QPushButton("Choose Color")
    self.color_button.clicked.connect(self.choose_color)
    layout.addRow("Background Color:", self.color_button)

    # Transparency as a percentage; floor of 10 keeps the window visible.
    self.transparency_slider = QSlider(Qt.Orientation.Horizontal)
    self.transparency_slider.setRange(10, 100)
    self.transparency_slider.setValue(90)
    layout.addRow("Transparency:", self.transparency_slider)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def create_model_settings_group(self):
    """Build the 'Model Settings' group: text-only and multimodal model pickers.

    Fix: removed stale pre-refactor lines (``self.layout.addRow`` duplicates
    and the old ``form_widget`` attachment) left over from the diff; the group
    now uses only its own form layout.
    """
    group_box = QGroupBox("Model Settings")
    layout = QFormLayout()

    # Text-only model selection
    self.text_model_combo = QComboBox()
    self.text_model_combo.addItems(self.get_model_names_by_type("text"))
    layout.addRow("Text-only Model:", self.text_model_combo)

    # Multimodal model selection
    self.multimodal_model_combo = QComboBox()
    self.multimodal_model_combo.addItems(self.get_model_names_by_type("image"))
    layout.addRow("Multimodal Model:", self.multimodal_model_combo)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

# Create a horizontal layout for the save button
button_layout = QHBoxLayout()
self.save_button = QPushButton("Save")
self.save_button.clicked.connect(self.accept)
button_layout.addStretch()
button_layout.addWidget(self.save_button)
def create_voice_activation_settings_group(self):
    """Build the 'Voice Activation Settings' group with the two wake-word toggles.

    Fix: removed stale lines interleaved from the diff (the old save-button
    layout attachment and a misplaced ``self.load_settings()`` call); the
    method now only constructs its own group box.
    """
    group_box = QGroupBox("Voice Activation Settings")
    layout = QVBoxLayout()

    # Master toggle: wake word opens the chat window.
    self.hey_llama_chat_checkbox = QCheckBox('Say "Hey Llama" to open chat form')
    self.hey_llama_chat_checkbox.stateChanged.connect(self.update_hey_llama_mic_state)
    layout.addWidget(self.hey_llama_chat_checkbox)

    # Dependent toggle: wake word also starts the microphone.
    self.hey_llama_mic_checkbox = QCheckBox('Say "Hey Llama" to activate microphone')
    layout.addWidget(self.hey_llama_mic_checkbox)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def accept(self):
self.save_settings()
Expand All @@ -91,6 +129,9 @@ def choose_color(self):
def reset_shortcut(self):
self.shortcut_recorder.setText("<cmd>+<shift>+<space>")

def update_hey_llama_mic_state(self, state):
    """Enable the mic checkbox only while the chat wake-word checkbox is checked."""
    chat_enabled = state == Qt.CheckState.Checked.value
    self.hey_llama_mic_checkbox.setEnabled(chat_enabled)

def load_settings(self):
home_dir = Path.home()
settings_file = home_dir / "llama_assistant" / "settings.json"
Expand All @@ -109,6 +150,10 @@ def load_settings(self):
multimodal_model = settings.get("multimodal_model")
if multimodal_model in self.get_model_names_by_type("image"):
self.multimodal_model_combo.setCurrentText(multimodal_model)

self.hey_llama_chat_checkbox.setChecked(settings.get("hey_llama_chat", False))
self.hey_llama_mic_checkbox.setChecked(settings.get("hey_llama_mic", False))
self.update_hey_llama_mic_state(settings.get("hey_llama_chat", False))
else:
self.color = QColor("#1E1E1E")
self.shortcut_recorder.setText("<cmd>+<shift>+<space>")
Expand All @@ -120,6 +165,8 @@ def get_settings(self):
"transparency": self.transparency_slider.value(),
"text_model": self.text_model_combo.currentText(),
"multimodal_model": self.multimodal_model_combo.currentText(),
"hey_llama_chat": self.hey_llama_chat_checkbox.isChecked(),
"hey_llama_mic": self.hey_llama_mic_checkbox.isChecked(),
}

def save_settings(self):
Expand Down
126 changes: 126 additions & 0 deletions llama_assistant/speech_recognition_whisper_experimental.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import threading
import queue
import pyaudio
import wave
import os
from pathlib import Path
import datetime
from whisper_cpp_python import Whisper
import re
import requests


class SpeechRecognition:
    """Continuously capture microphone audio and transcribe it with whisper.cpp.

    A capture thread reads one-second PCM chunks from the default input device
    and pushes them onto an audio queue; a transcription thread writes each
    chunk to a temporary WAV file, runs Whisper on it, and publishes cleaned
    text to a result queue consumed via get_transcription().

    Fixes over the original:
    - transcribe() no longer busy-polls the queue (it blocks with a timeout);
    - the temp WAV path is built from self.tmp_audio_folder with a
      filesystem-safe timestamp (the old name embedded str(datetime.now()),
      whose colons/spaces are invalid on Windows);
    - the temp file is removed even if transcription raises;
    - the model download streams to disk instead of buffering in memory and
      fails loudly on HTTP errors.
    """

    def __init__(self):
        # Audio settings: 16 kHz mono in 1-second chunks (Whisper expects 16 kHz).
        self.RATE = 16000
        self.CHUNK = self.RATE
        self.NB_CHANNELS = 1
        self.RECORD_SECONDS = 1

        # Whisper settings
        self.WHISPER_LANGUAGE = "en"
        self.WHISPER_THREADS = 1

        # Queues: capture thread -> audio_queue -> transcribe thread -> text_queue.
        self.audio_queue = queue.Queue()
        self.text_queue = queue.Queue()

        # Set up model path and download if necessary.
        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
        self.model_path = self.model_dir / "ggml-tiny-fp16.bin"
        self.download_model_if_needed()

        # Initialize Whisper model.
        self.whisper = Whisper(model_path=str(self.model_path), n_threads=self.WHISPER_THREADS)

        # Initialize PyAudio input stream on the default microphone.
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.NB_CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )

        # Temporary folder for intermediate WAV files.
        self.tmp_audio_folder = Path("./tmp_audio")
        if not self.tmp_audio_folder.exists():
            self.tmp_audio_folder.mkdir()

        self.stop_listening = False

    def download_model_if_needed(self):
        """Download the Whisper model into the local cache on first use."""
        if not self.model_path.exists():
            print("Downloading Whisper model...")
            self.model_dir.mkdir(parents=True, exist_ok=True)
            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-tiny-fp16.bin"
            # Stream to disk in 1 MiB chunks instead of holding the whole
            # model in memory; raise on HTTP errors rather than writing junk.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                with open(self.model_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            print("Model downloaded successfully.")

    def listen(self):
        """Capture thread: push raw 1-second PCM chunks onto the audio queue."""
        while not self.stop_listening:
            audio_data = self.stream.read(self.CHUNK)
            self.audio_queue.put(audio_data)

    def transcribe(self):
        """Transcription thread: drain the audio queue and emit cleaned text."""
        while not self.stop_listening:
            try:
                # Block briefly instead of spinning on an empty queue; the
                # timeout lets the loop re-check stop_listening periodically.
                audio_data = self.audio_queue.get(timeout=0.25)
            except queue.Empty:
                continue

            # Filesystem-safe timestamp (no colons/spaces) for the temp name.
            stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            tmp_filepath = str(self.tmp_audio_folder / f"output_{stamp}.wav")
            with wave.open(tmp_filepath, "wb") as wf:
                wf.setnchannels(self.NB_CHANNELS)
                wf.setsampwidth(2)  # 16-bit audio
                wf.setframerate(self.RATE)
                wf.writeframes(audio_data)

            try:
                # Transcribe audio.
                res = self.whisper.transcribe(file=tmp_filepath, language=self.WHISPER_LANGUAGE)
                transcription = res["text"]

                # Strip bracketed/parenthesized annotations, e.g. "[BLANK_AUDIO]".
                transcription = re.sub(r"\[.*\]", "", transcription)
                transcription = re.sub(r"\(.*\)", "", transcription)

                self.text_queue.put(transcription)
            finally:
                # Always remove the temp file, even if transcription raised.
                os.remove(tmp_filepath)

    def start(self):
        """Start the capture and transcription daemon threads."""
        self.stop_listening = False
        threading.Thread(target=self.listen, daemon=True).start()
        threading.Thread(target=self.transcribe, daemon=True).start()

    def stop(self):
        """Signal both threads to exit and release the audio device."""
        self.stop_listening = True
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

    def get_transcription(self):
        """Return the next pending transcription, or None if the queue is empty."""
        try:
            return self.text_queue.get_nowait()
        except queue.Empty:
            return None


# Example usage: run this module directly to stream transcriptions to stdout.
if __name__ == "__main__":
    recognizer = SpeechRecognition()
    recognizer.start()

    print("Speech recognition started. Press Ctrl+C to stop.")
    try:
        # Poll for finished transcriptions until interrupted.
        while True:
            if (text := recognizer.get_transcription()) is not None and text:
                print(f"Transcription: {text}")
    except KeyboardInterrupt:
        print("Stopping speech recognition...")
        recognizer.stop()
Loading

0 comments on commit 267bbb6

Please sign in to comment.