Skip to content

Commit

Permalink
Use WhisperCPP as offline speech recognizer
Browse files Browse the repository at this point in the history
  • Loading branch information
vietanhdev committed Oct 4, 2024
1 parent 3865f98 commit 88605ed
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 179 deletions.
39 changes: 19 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,17 @@ This assistant can run offline on your local machine, and it respects your priva

![Settings](https://raw.githubusercontent.com/vietanhdev/llama-assistant/refs/heads/main/docs/custom-models.png)


## Supported Models

- 📝 Text-only models:
- [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized)
- [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized)
- [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized).
- [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized).
- And other models that [LlamaCPP](https://github.com/ggerganov/llama.cpp) supports via custom models. [See the list](https://github.com/ggerganov/llama.cpp).

- 🖼️ Multimodal models:
- [Moondream2](https://huggingface.co/vikhyatk/moondream2)
- [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf)
- [LLaVA 1.5/1.6](https://llava-vl.github.io/)
- [Moondream2](https://huggingface.co/vikhyatk/moondream2).
- [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf).
- [LLaVA 1.5/1.6](https://llava-vl.github.io/).
- Besides supported models, you can try other variants via custom models.

## TODO
Expand All @@ -45,7 +44,7 @@ This assistant can run offline on your local machine, and it respects your priva
- [x] 📚 Support 5 other text models.
- [x] 🖼️ Support 5 other multimodal models.
- [x] ⚡ Streaming support for response.
- [ ] 🎙️ Add offline STT support: WhisperCPP (WIP - [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py)).
- [x] 🎙️ Add offline STT support: WhisperCPP.
- [ ] 🧠 Knowledge database: Langchain or LlamaIndex?
- [ ] 🔌 Plugin system for extensibility.
- [ ] 📰 News and weather updates.
Expand All @@ -59,11 +58,11 @@ This assistant can run offline on your local machine, and it respects your priva

## Features

- 🎙️ Voice recognition for hands-free interaction
- 💬 Natural language processing with Llama 3.2
- 🖼️ Image analysis capabilities (TODO)
- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS)
- 🎨 Customizable UI with adjustable transparency
- 🎙️ Voice recognition for hands-free interaction.
- 💬 Natural language processing with Llama 3.2.
- 🖼️ Image analysis capabilities (TODO).
- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS).
- 🎨 Customizable UI with adjustable transparency.

**Note:** This project is a work in progress, and new features are being added regularly.

Expand All @@ -89,17 +88,17 @@ pip install pyaudio

1. Clone the repository:

```bash
git clone https://github.com/vietanhdev/llama-assistant.git
cd llama-assistant
```
```bash
git clone https://github.com/vietanhdev/llama-assistant.git
cd llama-assistant
```

2. Install the required dependencies:

```bash
pip install -r requirements.txt
pip install pyaudio
```
```bash
pip install -r requirements.txt
pip install pyaudio
```

</details>

Expand Down
2 changes: 1 addition & 1 deletion llama_assistant/llama_assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
from llama_assistant.global_hotkey import GlobalHotkey
from llama_assistant.setting_dialog import SettingsDialog
from llama_assistant.speech_recognition import SpeechRecognitionThread
from llama_assistant.speech_recognition_thread import SpeechRecognitionThread
from llama_assistant.utils import image_to_base64_data_uri, load_image
from llama_assistant.model_handler import handler as model_handler
from llama_assistant.icons import (
Expand Down
31 changes: 0 additions & 31 deletions llama_assistant/speech_recognition.py

This file was deleted.

138 changes: 138 additions & 0 deletions llama_assistant/speech_recognition_thread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import pkgutil
from pathlib import Path
import datetime
import os
import re
import requests

from PyQt6.QtCore import QThread, pyqtSignal
import speech_recognition as sr

# Work around whisper_cpp_python raising FileNotFoundError at import time.
# The fallback below rewrites the package's whisper_cpp.py so that the
# "darwin" entry uses the ".dylib" library extension instead of ".so".
# https://github.com/carloscdias/whisper-cpp-python/pull/12
try:
    import whisper_cpp_python
except FileNotFoundError:
    # Imported here because it is only needed on this fallback path.
    # importlib.util.find_spec replaces the deprecated pkgutil.get_loader.
    import importlib.util

    regex = r"(\"darwin\":\n\s*lib_ext = \")\.so(\")"
    subst = "\\1.dylib\\2"

    print("fixing and re-importing whisper_cpp_python...")
    # Locate the installed package without importing it; for a package,
    # spec.origin is the path of its __init__.py.
    spec = importlib.util.find_spec("whisper_cpp_python")
    whisper_path = Path(spec.origin)
    whisper_cpp_py = whisper_path.parent.joinpath("whisper_cpp.py")
    content = whisper_cpp_py.read_text()
    # count/flags are passed by keyword: passing them positionally to re.sub
    # is deprecated and easy to mix up.
    result = re.sub(regex, subst, content, count=0, flags=re.MULTILINE)
    whisper_cpp_py.write_text(result)

    import whisper_cpp_python


class SpeechRecognitionThread(QThread):
    """Background thread that records microphone audio and transcribes it with Whisper.

    Signals:
        finished(str): emitted with the cleaned transcription of each phrase.
        error(str): emitted with a message when recognition reports a failure.

    The constructor downloads the Whisper model on first use and loads it, so
    creating an instance may block and may raise if the download fails.
    Call ``stop()`` to make the listening loop in ``run()`` exit.
    """

    finished = pyqtSignal(str)
    error = pyqtSignal(str)
    WHISPER_THREADS = 4
    WHISPER_LANGUAGE = "en"
    # Source of the model file and download tuning knobs.
    MODEL_URL = (
        "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-base-fp16.bin"
    )
    DOWNLOAD_TIMEOUT = 30  # seconds, passed to requests as the timeout
    DOWNLOAD_CHUNK_SIZE = 1024 * 1024  # bytes written per chunk while streaming

    def __init__(self):
        super().__init__()
        self.stop_listening = False

        # Set up model path and download if necessary
        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
        self.model_path = self.model_dir / "ggml-base-fp16.bin"
        self.download_model_if_needed()

        # Initialize Whisper model
        self.whisper = whisper_cpp_python.Whisper(
            model_path=str(self.model_path), n_threads=self.WHISPER_THREADS
        )

        # Create temporary folder for audio files
        self.tmp_audio_folder = Path.home() / "llama-assistant" / "tmp_audio"
        self.tmp_audio_folder.mkdir(parents=True, exist_ok=True)

    def download_model_if_needed(self):
        """Download the Whisper model to ``self.model_path`` unless it already exists.

        The body is streamed to a ``.part`` file and only renamed to the final
        name once the download completed, so an interrupted or failed download
        never leaves a truncated file that would later be mistaken for a model.

        Raises:
            requests.RequestException: on connection problems, timeouts or a
                non-success HTTP status.
        """
        if self.model_path.exists():
            return

        print("Downloading Whisper model...")
        self.model_dir.mkdir(parents=True, exist_ok=True)
        partial_path = self.model_path.with_name(self.model_path.name + ".part")
        try:
            with requests.get(
                self.MODEL_URL, stream=True, timeout=self.DOWNLOAD_TIMEOUT
            ) as response:
                # Fail loudly instead of saving an HTML error page as the model.
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                        f.write(chunk)
            partial_path.replace(self.model_path)
        finally:
            # Only still present when the download did not complete.
            if partial_path.exists():
                partial_path.unlink()
        print("Model downloaded successfully.")

    @staticmethod
    def _clean_transcription(text):
        """Return ``text`` with ``[...]`` and ``(...)`` annotations removed.

        The patterns are non-greedy so that speech located between two
        annotations (e.g. ``"[MUSIC] hello (noise)"``) is preserved.
        """
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"\(.*?\)", "", text)
        return text

    def _transcribe(self, audio_data):
        """Write ``audio_data`` to a temporary WAV file and return its cleaned transcription.

        The temporary file is removed even when transcription raises.
        """
        # Use a timestamp without spaces or colons so the name is a valid
        # filename on every platform.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        tmp_filepath = self.tmp_audio_folder / f"temp_audio_{timestamp}.wav"
        try:
            with open(tmp_filepath, "wb") as f:
                f.write(audio_data.get_wav_data())

            res = self.whisper.transcribe(
                file=str(tmp_filepath), language=self.WHISPER_LANGUAGE
            )
        finally:
            if tmp_filepath.exists():
                os.remove(tmp_filepath)

        return self._clean_transcription(res["text"])

    def run(self):
        """Listen on the microphone until ``stop()`` is called.

        Each captured phrase is transcribed and emitted through ``finished``;
        ``UnknownValueError`` / ``RequestError`` are reported through ``error``
        and listening continues.
        """
        recognizer = sr.Recognizer()
        microphone = sr.Microphone()
        try:
            with microphone as source:
                recognizer.adjust_for_ambient_noise(source)
                # Loop-invariant, so set once instead of on every iteration.
                recognizer.pause_threshold = 1
                while not self.stop_listening:
                    try:
                        audio_data = recognizer.listen(source, timeout=1, phrase_time_limit=5)
                        transcription = self._transcribe(audio_data)

                        print(f"Transcription: {transcription}")
                        self.finished.emit(transcription)
                    except sr.WaitTimeoutError:
                        # Nothing was said within the timeout; re-check the stop flag.
                        print("timeout")
                        continue
                    except sr.UnknownValueError:
                        print("Could not understand audio")
                        self.error.emit("Could not understand audio")
                    except sr.RequestError as e:
                        print(f"Could not request results; {e}")
                        self.error.emit(f"Could not request results; {e}")
        except KeyboardInterrupt:
            print("Keyboard interrupt detected. Stopping speech recognition.")
            self.stop()

    def stop(self):
        """Ask the listening loop in ``run()`` to exit before its next listen attempt."""
        self.stop_listening = True


# Demo code: transcribe a single phrase (or report a single error), then exit.
if __name__ == "__main__":
    from PyQt6.QtWidgets import QApplication
    import sys

    app = QApplication(sys.argv)

    def on_finished(text):
        """Print the first transcription and shut the demo down."""
        print(f"Transcription: {text}")
        thread.stop()
        app.quit()

    def on_error(error_message):
        """Print the first recognition error and shut the demo down."""
        print(f"Error: {error_message}")
        thread.stop()
        app.quit()

    thread = SpeechRecognitionThread()
    thread.finished.connect(on_finished)
    thread.error.connect(on_error)

    print("Starting speech recognition. Speak into your microphone...")
    thread.start()

    exit_code = app.exec()
    # Make sure the worker has really left run() before the interpreter tears
    # the QThread object down; exiting while it is still running is unsafe.
    thread.stop()
    thread.wait()
    sys.exit(exit_code)
126 changes: 0 additions & 126 deletions llama_assistant/speech_recognition_whisper_experimental.py

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "llama-assistant"
version = "0.1.26"
version = "0.1.28"
authors = [
{name = "Viet-Anh Nguyen", email = "[email protected]"},
]
Expand Down

0 comments on commit 88605ed

Please sign in to comment.