Use WhisperCPP as offline speech recognizer

nrl-ai · Oct 4, 2024 · 88605ed · 88605ed
1 parent 3865f98
commit 88605ed
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 179 deletions.
diff --git a/README.md b/README.md
@@ -23,18 +23,17 @@ This assistant can run offline on your local machine, and it respects your priva
 
 ![Settings](https://raw.githubusercontent.com/vietanhdev/llama-assistant/refs/heads/main/docs/custom-models.png)
 
-
 ## Supported Models
 
 - 📝 Text-only models:
-  - [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized)
-  - [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized)
+  - [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized).
+  - [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized).
   - And other models that [LlamaCPP](https://github.com/ggerganov/llama.cpp) supports via custom models. [See the list](https://github.com/ggerganov/llama.cpp).
 
 - 🖼️ Multimodal models:
-  - [Moondream2](https://huggingface.co/vikhyatk/moondream2)
-  - [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf)
-  - [LLaVA 1.5/1.6](https://llava-vl.github.io/)
+  - [Moondream2](https://huggingface.co/vikhyatk/moondream2).
+  - [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf).
+  - [LLaVA 1.5/1.6](https://llava-vl.github.io/).
   - Besides supported models, you can try other variants via custom models.
 
 ## TODO
@@ -45,7 +44,7 @@ This assistant can run offline on your local machine, and it respects your priva
 - [x] 📚 Support 5 other text models.
 - [x] 🖼️ Support 5 other multimodal models.
 - [x] ⚡ Streaming support for response.
-- [ ] 🎙️ Add offline STT support: WhisperCPP (WIP - [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py)).
+- [x] 🎙️ Add offline STT support: WhisperCPP.
 - [ ] 🧠 Knowledge database: LangChain or LlamaIndex?
 - [ ] 🔌 Plugin system for extensibility.
 - [ ] 📰 News and weather updates.
@@ -59,11 +58,11 @@ This assistant can run offline on your local machine, and it respects your priva
 
 ## Features
 
-- 🎙️ Voice recognition for hands-free interaction
-- 💬 Natural language processing with Llama 3.2
-- 🖼️ Image analysis capabilities (TODO)
-- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS)
-- 🎨 Customizable UI with adjustable transparency
+- 🎙️ Voice recognition for hands-free interaction.
+- 💬 Natural language processing with Llama 3.2.
+- 🖼️ Image analysis capabilities (TODO).
+- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS).
+- 🎨 Customizable UI with adjustable transparency.
 
 **Note:** This project is a work in progress, and new features are being added regularly.
 
@@ -89,17 +88,17 @@ pip install pyaudio
 
 1. Clone the repository:
 
-   ```bash
-   git clone https://github.com/vietanhdev/llama-assistant.git
-   cd llama-assistant
-   ```
+```bash
+git clone https://github.com/vietanhdev/llama-assistant.git
+cd llama-assistant
+```
 
 2. Install the required dependencies:
 
-   ```bash
-   pip install -r requirements.txt
-   pip install pyaudio
-   ```
+```bash
+pip install -r requirements.txt
+pip install pyaudio
+```
 
 </details>
 

diff --git a/llama_assistant/llama_assistant.py b/llama_assistant/llama_assistant.py
@@ -46,7 +46,7 @@
 from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
 from llama_assistant.global_hotkey import GlobalHotkey
 from llama_assistant.setting_dialog import SettingsDialog
-from llama_assistant.speech_recognition import SpeechRecognitionThread
+from llama_assistant.speech_recognition_thread import SpeechRecognitionThread
 from llama_assistant.utils import image_to_base64_data_uri, load_image
 from llama_assistant.model_handler import handler as model_handler
 from llama_assistant.icons import (

diff --git a/llama_assistant/speech_recognition.py b/llama_assistant/speech_recognition.py
diff --git a/llama_assistant/speech_recognition_thread.py b/llama_assistant/speech_recognition_thread.py
@@ -0,0 +1,138 @@
+import pkgutil
+from pathlib import Path
+import datetime
+import os
+import re
+import requests
+
+from PyQt6.QtCore import QThread, pyqtSignal
+import speech_recognition as sr
+
+# patch whisper on file not find error
+# https://github.com/carloscdias/whisper-cpp-python/pull/12
+try:
+    import whisper_cpp_python
+except FileNotFoundError:
+    regex = r"(\"darwin\":\n\s*lib_ext = \")\.so(\")"
+    subst = "\\1.dylib\\2"
+
+    print("fixing and re-importing whisper_cpp_python...")
+    # load whisper_cpp_python and substitute .so with .dylib for darwin
+    package = pkgutil.get_loader("whisper_cpp_python")
+    whisper_path = Path(package.path)
+    whisper_cpp_py = whisper_path.parent.joinpath("whisper_cpp.py")
+    content = whisper_cpp_py.read_text()
+    result = re.sub(regex, subst, content, 0, re.MULTILINE)
+    whisper_cpp_py.write_text(result)
+
+    import whisper_cpp_python
+
+
+class SpeechRecognitionThread(QThread):
+    finished = pyqtSignal(str)
+    error = pyqtSignal(str)
+    WHISPER_THREADS = 4
+    WHISPER_LANGUAGE = "en"
+
+    def __init__(self):
+        super().__init__()
+        self.stop_listening = False
+
+        # Set up model path and download if necessary
+        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
+        self.model_path = self.model_dir / "ggml-base-fp16.bin"
+        self.download_model_if_needed()
+
+        # Initialize Whisper model
+        self.whisper = whisper_cpp_python.Whisper(
+            model_path=str(self.model_path), n_threads=self.WHISPER_THREADS
+        )
+
+        # Create temporary folder for audio files
+        self.tmp_audio_folder = Path.home() / "llama-assistant" / "tmp_audio"
+        self.tmp_audio_folder.mkdir(parents=True, exist_ok=True)
+
+    def download_model_if_needed(self):
+        if not self.model_path.exists():
+            print("Downloading Whisper model...")
+            self.model_dir.mkdir(parents=True, exist_ok=True)
+            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-base-fp16.bin"
+            response = requests.get(url)
+            with open(self.model_path, "wb") as f:
+                f.write(response.content)
+            print("Model downloaded successfully.")
+
+    def run(self):
+        recognizer = sr.Recognizer()
+        microphone = sr.Microphone()
+        try:
+            with microphone as source:
+                recognizer.adjust_for_ambient_noise(source)
+                while not self.stop_listening:
+                    try:
+                        recognizer.pause_threshold = 1
+                        audio_data = recognizer.listen(source, timeout=1, phrase_time_limit=5)
+
+                        # Save audio data to temporary file
+                        tmp_filepath = (
+                            self.tmp_audio_folder / f"temp_audio_{datetime.datetime.now()}.wav"
+                        )
+                        with open(tmp_filepath, "wb") as f:
+                            f.write(audio_data.get_wav_data())
+
+                        # Transcribe audio
+                        res = self.whisper.transcribe(
+                            file=tmp_filepath, language=self.WHISPER_LANGUAGE
+                        )
+                        transcription = res["text"]
+
+                        # Clean up transcription
+                        transcription = re.sub(r"\[.*\]", "", transcription)
+                        transcription = re.sub(r"\(.*\)", "", transcription)
+
+                        print(f"Transcription: {transcription}")
+                        os.remove(tmp_filepath)
+
+                        self.finished.emit(transcription)
+                    except sr.WaitTimeoutError:
+                        print("timeout")
+                        continue
+                    except sr.UnknownValueError:
+                        print("Could not understand audio")
+                        self.error.emit("Could not understand audio")
+                    except sr.RequestError as e:
+                        print(f"Could not request results; {e}")
+                        self.error.emit(f"Could not request results; {e}")
+        except KeyboardInterrupt:
+            print("Keyboard interrupt detected. Stopping speech recognition.")
+            self.stop()
+
+    def stop(self):
+        self.stop_listening = True
+
+
+# Demo code
+if __name__ == "__main__":
+    from PyQt6.QtWidgets import QApplication
+    import sys
+
+    app = QApplication(sys.argv)
+
+    def on_finished(text):
+        print(f"Transcription: {text}")
+        thread.stop()
+        app.quit()
+
+    def on_error(error_message):
+        print(f"Error: {error_message}")
+        thread.stop()
+        app.quit()
+
+    thread = SpeechRecognitionThread()
+    thread.finished.connect(on_finished)
+    thread.error.connect(on_error)
+
+    print("Starting speech recognition. Speak into your microphone...")
+    thread.start()
+
+    sys.exit(app.exec())
diff --git a/llama_assistant/speech_recognition_whisper_experimental.py b/llama_assistant/speech_recognition_whisper_experimental.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llama-assistant"
-version = "0.1.26"
+version = "0.1.28"
 authors = [
     {name = "Viet-Anh Nguyen", email = "[email protected]"},
 ]