Skip to content

Commit

Permalink
Add wake word detector: Hey Llama
Browse files Browse the repository at this point in the history
  • Loading branch information
vietanhdev committed Sep 28, 2024
1 parent 6d632ac commit 267bbb6
Show file tree
Hide file tree
Showing 9 changed files with 349 additions and 31 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ This assistant can run offline on your local machine, and it respects your priva
## TODO

- [x] 🖼️ Support multimodal model: [moondream2](https://huggingface.co/vikhyatk/moondream2).
- [ ] 🎙️ Add offline STT support: WhisperCPP.
- [ ] 🗣️ Add wake word detection: "Hey Llama!".
- [x] 🗣️ Add wake word detection: "Hey Llama!".
- [ ] 🎙️ Add offline STT support: WhisperCPP. [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py).
- [ ] 📚 Support 5 other text models.
- [ ] 🖼️ Support 5 other multimodal models.
- [ ] 🧠 Knowledge database: Langchain or LlamaIndex?.
Expand Down
35 changes: 34 additions & 1 deletion llama_assistant/llama_assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
QFont,
QBitmap,
)
from llama_assistant.wake_word_detector import WakeWordDetector

from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
from llama_assistant.global_hotkey import GlobalHotkey
Expand All @@ -54,6 +55,7 @@
class LlamaAssistant(QMainWindow):
def __init__(self):
super().__init__()
self.wake_word_detector = None
self.load_settings()
self.init_ui()
self.init_tray()
Expand All @@ -66,6 +68,18 @@ def __init__(self):
self.current_text_model = self.settings.get("text_model")
self.current_multimodal_model = self.settings.get("multimodal_model")

def init_wake_word_detector(self):
    """(Re)create the 'Hey Llama' detector and start listening in the background."""
    # Tear down any previous detector first so only one is ever running.
    if self.wake_word_detector is not None:
        self.deinit_wake_word_detector()
    detector = WakeWordDetector()
    detector.wakeword_detected.connect(self.on_wake_word_detected)
    self.wake_word_detector = detector
    detector.start()

def deinit_wake_word_detector(self):
    """Stop the wake-word detector if it is running and drop the reference."""
    detector = self.wake_word_detector
    if detector.running:
        detector.stop()
    self.wake_word_detector = None

def load_settings(self):
home_dir = Path.home()
settings_dir = home_dir / "llama_assistant"
Expand All @@ -90,8 +104,14 @@ def load_settings(self):
"transparency": 90,
"text_model": "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
"multimodal_model": "vikhyatk/moondream2",
"hey_llama_chat": False,
"hey_llama_mic": False,
}
self.save_settings()
if self.settings.get("hey_llama_chat", False) and self.wake_word_detector is None:
self.init_wake_word_detector()
if not self.settings.get("hey_llama_chat", False) and self.wake_word_detector is not None:
self.deinit_wake_word_detector()
self.current_text_model = self.settings.get("text_model")
self.current_multimodal_model = self.settings.get("multimodal_model")

Expand Down Expand Up @@ -577,6 +597,13 @@ def mouseMoveEvent(self, event):
self.move(self.x() + delta.x(), self.y() + delta.y())
self.oldPos = event.globalPosition().toPoint()

def on_wake_word_detected(self, model_name):
    """Bring the assistant window to the foreground when the wake word fires.

    model_name is supplied by the detector's signal; it is not used here.
    """
    # Show, focus, and raise the window, in that order.
    for bring_forward in (self.show, self.activateWindow, self.raise_):
        bring_forward()
    # Optionally start recording immediately ("Hey Llama" mic mode).
    if self.settings.get("hey_llama_mic", False):
        self.start_voice_input()

def toggle_voice_input(self):
if not self.is_listening:
self.start_voice_input()
Expand Down Expand Up @@ -627,9 +654,15 @@ def on_speech_recognized(self, text):
self.input_field.setPlainText(f"{current_text}\n{text}")
else:
self.input_field.setPlainText(text)
self.stop_voice_input()

def on_speech_error(self, error_message):
    """Log a speech-recognition failure and stop the voice input session.

    Fix: the block contained both the old bare ``print(error_message)`` and
    its replacement (a leftover diff artifact), printing the error twice.
    Only the descriptive message is kept.
    """
    print(f"Speech recognition error: {error_message}")
    self.stop_voice_input()

def closeEvent(self, event):
    """Qt close hook: shut down the wake-word thread before the window closes.

    Fix: ``self.wake_word_detector`` is initialized to ``None`` and only
    created when the "hey_llama_chat" setting is enabled, so calling
    ``stop()`` unconditionally raised AttributeError on close whenever the
    wake word feature was disabled. Guard against None first.
    """
    if self.wake_word_detector is not None:
        self.wake_word_detector.stop()
    super().closeEvent(event)


if __name__ == "__main__":
Expand Down
Binary file added llama_assistant/resources/wk_hey_llama.onnx
Binary file not shown.
95 changes: 71 additions & 24 deletions llama_assistant/setting_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
QVBoxLayout,
QHBoxLayout,
QWidget,
QCheckBox,
QGroupBox,
)
from PyQt6.QtCore import pyqtSignal
from PyQt6.QtCore import Qt
Expand All @@ -28,52 +30,88 @@ def __init__(self, parent=None):
self.setWindowTitle("Settings")
self.main_layout = QVBoxLayout(self)

# Create a form layout for the settings
form_widget = QWidget()
self.layout = QFormLayout(form_widget)
self.layout.setFormAlignment(Qt.AlignmentFlag.AlignLeft)
self.layout.setLabelAlignment(Qt.AlignmentFlag.AlignLeft)
# General Settings Group
self.create_general_settings_group()

# Appearance Settings Group
self.create_appearance_settings_group()

# Model Settings Group
self.create_model_settings_group()

# Voice Activation Settings Group
self.create_voice_activation_settings_group()

# Create a horizontal layout for the save button
button_layout = QHBoxLayout()
self.save_button = QPushButton("Save")
self.save_button.clicked.connect(self.accept)
button_layout.addStretch()
button_layout.addWidget(self.save_button)

# Add the button layout to the main layout
self.main_layout.addLayout(button_layout)

self.load_settings()

def create_general_settings_group(self):
    """Build the 'General Settings' group: hotkey recorder plus reset button.

    Fix: the block carried stale pre-refactor lines (``self.layout.addRow``)
    duplicated next to their replacements (``layout.addRow``) — a diff
    artifact; each row is now added exactly once to the group's own layout.
    """
    group_box = QGroupBox("General Settings")
    layout = QFormLayout()

    self.shortcut_recorder = ShortcutRecorder()
    layout.addRow("Shortcut:", self.shortcut_recorder)

    self.reset_shortcut_button = QPushButton("Reset Shortcut")
    self.reset_shortcut_button.clicked.connect(self.reset_shortcut)
    layout.addRow(self.reset_shortcut_button)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def create_appearance_settings_group(self):
    """Build the 'Appearance Settings' group: background color and transparency.

    Fix: stale pre-refactor ``self.layout.addRow`` lines sat beside their
    ``layout.addRow`` replacements (diff artifact); each row is added once.
    """
    group_box = QGroupBox("Appearance Settings")
    layout = QFormLayout()

    self.color_button = QPushButton("Choose Color")
    self.color_button.clicked.connect(self.choose_color)
    layout.addRow("Background Color:", self.color_button)

    # Transparency as a percentage; floor of 10 keeps the window visible.
    self.transparency_slider = QSlider(Qt.Orientation.Horizontal)
    self.transparency_slider.setRange(10, 100)
    self.transparency_slider.setValue(90)
    layout.addRow("Transparency:", self.transparency_slider)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def create_model_settings_group(self):
    """Build the 'Model Settings' group: text-only and multimodal model pickers.

    Fix: removed stale pre-refactor lines (``self.layout.addRow`` duplicates
    and the old ``form_widget`` attachment) left over from the diff; the group
    now uses only its own form layout.
    """
    group_box = QGroupBox("Model Settings")
    layout = QFormLayout()

    # Text-only model selection
    self.text_model_combo = QComboBox()
    self.text_model_combo.addItems(self.get_model_names_by_type("text"))
    layout.addRow("Text-only Model:", self.text_model_combo)

    # Multimodal model selection
    self.multimodal_model_combo = QComboBox()
    self.multimodal_model_combo.addItems(self.get_model_names_by_type("image"))
    layout.addRow("Multimodal Model:", self.multimodal_model_combo)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

# Create a horizontal layout for the save button
button_layout = QHBoxLayout()
self.save_button = QPushButton("Save")
self.save_button.clicked.connect(self.accept)
button_layout.addStretch()
button_layout.addWidget(self.save_button)
def create_voice_activation_settings_group(self):
    """Build the 'Voice Activation Settings' group with the two wake-word toggles.

    Fix: removed stale lines interleaved from the diff (the old save-button
    layout attachment and a misplaced ``self.load_settings()`` call); the
    method now only constructs its own group box.
    """
    group_box = QGroupBox("Voice Activation Settings")
    layout = QVBoxLayout()

    # Master toggle: wake word opens the chat window.
    self.hey_llama_chat_checkbox = QCheckBox('Say "Hey Llama" to open chat form')
    self.hey_llama_chat_checkbox.stateChanged.connect(self.update_hey_llama_mic_state)
    layout.addWidget(self.hey_llama_chat_checkbox)

    # Dependent toggle: wake word also starts the microphone.
    self.hey_llama_mic_checkbox = QCheckBox('Say "Hey Llama" to activate microphone')
    layout.addWidget(self.hey_llama_mic_checkbox)

    group_box.setLayout(layout)
    self.main_layout.addWidget(group_box)

def accept(self):
self.save_settings()
Expand All @@ -91,6 +129,9 @@ def choose_color(self):
def reset_shortcut(self):
self.shortcut_recorder.setText("<cmd>+<shift>+<space>")

def update_hey_llama_mic_state(self, state):
    """Enable the mic checkbox only while the chat wake-word checkbox is checked."""
    chat_enabled = state == Qt.CheckState.Checked.value
    self.hey_llama_mic_checkbox.setEnabled(chat_enabled)

def load_settings(self):
home_dir = Path.home()
settings_file = home_dir / "llama_assistant" / "settings.json"
Expand All @@ -109,6 +150,10 @@ def load_settings(self):
multimodal_model = settings.get("multimodal_model")
if multimodal_model in self.get_model_names_by_type("image"):
self.multimodal_model_combo.setCurrentText(multimodal_model)

self.hey_llama_chat_checkbox.setChecked(settings.get("hey_llama_chat", False))
self.hey_llama_mic_checkbox.setChecked(settings.get("hey_llama_mic", False))
self.update_hey_llama_mic_state(settings.get("hey_llama_chat", False))
else:
self.color = QColor("#1E1E1E")
self.shortcut_recorder.setText("<cmd>+<shift>+<space>")
Expand All @@ -120,6 +165,8 @@ def get_settings(self):
"transparency": self.transparency_slider.value(),
"text_model": self.text_model_combo.currentText(),
"multimodal_model": self.multimodal_model_combo.currentText(),
"hey_llama_chat": self.hey_llama_chat_checkbox.isChecked(),
"hey_llama_mic": self.hey_llama_mic_checkbox.isChecked(),
}

def save_settings(self):
Expand Down
126 changes: 126 additions & 0 deletions llama_assistant/speech_recognition_whisper_experimental.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import threading
import queue
import pyaudio
import wave
import os
from pathlib import Path
import datetime
from whisper_cpp_python import Whisper
import re
import requests


class SpeechRecognition:
    """Continuously capture microphone audio and transcribe it with whisper.cpp.

    A capture thread reads one-second PCM chunks from the default input device
    and pushes them onto an audio queue; a transcription thread writes each
    chunk to a temporary WAV file, runs Whisper on it, and publishes cleaned
    text to a result queue consumed via get_transcription().

    Fixes over the original:
    - transcribe() no longer busy-polls the queue (it blocks with a timeout);
    - the temp WAV path is built from self.tmp_audio_folder with a
      filesystem-safe timestamp (the old name embedded str(datetime.now()),
      whose colons/spaces are invalid on Windows);
    - the temp file is removed even if transcription raises;
    - the model download streams to disk instead of buffering in memory and
      fails loudly on HTTP errors.
    """

    def __init__(self):
        # Audio settings: 16 kHz mono in 1-second chunks (Whisper expects 16 kHz).
        self.RATE = 16000
        self.CHUNK = self.RATE
        self.NB_CHANNELS = 1
        self.RECORD_SECONDS = 1

        # Whisper settings
        self.WHISPER_LANGUAGE = "en"
        self.WHISPER_THREADS = 1

        # Queues: capture thread -> audio_queue -> transcribe thread -> text_queue.
        self.audio_queue = queue.Queue()
        self.text_queue = queue.Queue()

        # Set up model path and download if necessary.
        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
        self.model_path = self.model_dir / "ggml-tiny-fp16.bin"
        self.download_model_if_needed()

        # Initialize Whisper model.
        self.whisper = Whisper(model_path=str(self.model_path), n_threads=self.WHISPER_THREADS)

        # Initialize PyAudio input stream on the default microphone.
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.NB_CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )

        # Temporary folder for intermediate WAV files.
        self.tmp_audio_folder = Path("./tmp_audio")
        if not self.tmp_audio_folder.exists():
            self.tmp_audio_folder.mkdir()

        self.stop_listening = False

    def download_model_if_needed(self):
        """Download the Whisper model into the local cache on first use."""
        if not self.model_path.exists():
            print("Downloading Whisper model...")
            self.model_dir.mkdir(parents=True, exist_ok=True)
            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-tiny-fp16.bin"
            # Stream to disk in 1 MiB chunks instead of holding the whole
            # model in memory; raise on HTTP errors rather than writing junk.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                with open(self.model_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            print("Model downloaded successfully.")

    def listen(self):
        """Capture thread: push raw 1-second PCM chunks onto the audio queue."""
        while not self.stop_listening:
            audio_data = self.stream.read(self.CHUNK)
            self.audio_queue.put(audio_data)

    def transcribe(self):
        """Transcription thread: drain the audio queue and emit cleaned text."""
        while not self.stop_listening:
            try:
                # Block briefly instead of spinning on an empty queue; the
                # timeout lets the loop re-check stop_listening periodically.
                audio_data = self.audio_queue.get(timeout=0.25)
            except queue.Empty:
                continue

            # Filesystem-safe timestamp (no colons/spaces) for the temp name.
            stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            tmp_filepath = str(self.tmp_audio_folder / f"output_{stamp}.wav")
            with wave.open(tmp_filepath, "wb") as wf:
                wf.setnchannels(self.NB_CHANNELS)
                wf.setsampwidth(2)  # 16-bit audio
                wf.setframerate(self.RATE)
                wf.writeframes(audio_data)

            try:
                # Transcribe audio.
                res = self.whisper.transcribe(file=tmp_filepath, language=self.WHISPER_LANGUAGE)
                transcription = res["text"]

                # Strip bracketed/parenthesized annotations, e.g. "[BLANK_AUDIO]".
                transcription = re.sub(r"\[.*\]", "", transcription)
                transcription = re.sub(r"\(.*\)", "", transcription)

                self.text_queue.put(transcription)
            finally:
                # Always remove the temp file, even if transcription raised.
                os.remove(tmp_filepath)

    def start(self):
        """Start the capture and transcription daemon threads."""
        self.stop_listening = False
        threading.Thread(target=self.listen, daemon=True).start()
        threading.Thread(target=self.transcribe, daemon=True).start()

    def stop(self):
        """Signal both threads to exit and release the audio device."""
        self.stop_listening = True
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

    def get_transcription(self):
        """Return the next pending transcription, or None if the queue is empty."""
        try:
            return self.text_queue.get_nowait()
        except queue.Empty:
            return None


# Example usage: run this module directly to stream transcriptions to stdout.
if __name__ == "__main__":
    recognizer = SpeechRecognition()
    recognizer.start()

    print("Speech recognition started. Press Ctrl+C to stop.")
    try:
        # Poll for finished transcriptions until interrupted.
        while True:
            if (text := recognizer.get_transcription()) is not None and text:
                print(f"Transcription: {text}")
    except KeyboardInterrupt:
        print("Stopping speech recognition...")
        recognizer.stop()
Loading

0 comments on commit 267bbb6

Please sign in to comment.