Skip to content

Commit

Permalink
Support multimodal model: MoonDream 2
Browse files Browse the repository at this point in the history
  • Loading branch information
vietanhdev committed Sep 28, 2024
1 parent b418cd9 commit 298fcb3
Show file tree
Hide file tree
Showing 12 changed files with 376 additions and 79 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ This assistant can run offline on your local machine, and it respects your priva

## TODO

- [ ] Support other text models: Llama 3.x.
- [ ] Support multimodal models: LLaVA, Llama 3.2 + Vision.
- [x] Support multimodal model: [moondream2](https://huggingface.co/vikhyatk/moondream2).
- [ ] Add offline STT support: WhisperCPP.
- [ ] Add wake word detection: "Hey Llama!".
- [ ] Knowledge database.
- [ ] Video interaction support.
- [ ] Support 5 other text models.
- [ ] Support 5 other multimodal models.
- [ ] Knowledge database: LangChain or LlamaIndex?
- [ ] Plugin system for extensibility.
- [ ] Package for Windows, Linux, and macOS.

## Features

Expand Down
8 changes: 6 additions & 2 deletions llama_assistant/custom_plaintext_editor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from PyQt6.QtGui import QKeyEvent
from PyQt6.QtCore import Qt, pyqtSignal


class CustomPlainTextEdit(QPlainTextEdit):
submit = pyqtSignal()

Expand All @@ -19,7 +20,10 @@ def __init__(self, submit_callback, parent=None):
)

def keyPressEvent(self, event: QKeyEvent):
    """Emit ``submit`` on a Return press without Shift.

    Every other key press, including Return with Shift held, is
    forwarded unchanged to the parent class implementation.
    """
    # Anything that is not Return is ordinary editing input.
    if event.key() != Qt.Key.Key_Return:
        super().keyPressEvent(event)
        return

    # Return with Shift held is handled by the parent class, not submitted.
    if event.modifiers() & Qt.KeyboardModifier.ShiftModifier:
        super().keyPressEvent(event)
        return

    self.submit.emit()
8 changes: 4 additions & 4 deletions llama_assistant/global_hotkey.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from PyQt6.QtCore import QObject, pyqtSignal
from pynput import keyboard


class GlobalHotkey(QObject):
activated = pyqtSignal()

def __init__(self, hotkey):
    """Build the hotkey matcher and start a keyboard listener feeding it.

    Args:
        hotkey: Hotkey description passed to ``keyboard.HotKey.parse``.
    """
    super().__init__()

    # ``on_activate`` is the callback handed to the HotKey object.
    parsed_keys = keyboard.HotKey.parse(hotkey)
    self.hotkey = keyboard.HotKey(parsed_keys, self.on_activate)

    # Both press and release events are wrapped by ``for_canonical``
    # before being handed to the HotKey state machine.
    press_handler = self.for_canonical(self.hotkey.press)
    release_handler = self.for_canonical(self.hotkey.release)
    self.listener = keyboard.Listener(
        on_press=press_handler,
        on_release=release_handler,
    )
    self.listener.start()

Expand All @@ -24,4 +24,4 @@ def for_canonical(self, f):

def stop(self):
    """Stop the keyboard listener if one is set; otherwise do nothing."""
    active_listener = self.listener
    if not active_listener:
        return
    active_listener.stop()
169 changes: 116 additions & 53 deletions llama_assistant/llama_assistant.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import markdown
from llama_cpp import Llama
from pathlib import Path
from importlib import resources

from PyQt6.QtWidgets import (
QApplication,
Expand All @@ -14,6 +14,7 @@
QSystemTrayIcon,
QMenu,
QLabel,
QScrollArea,
)
from PyQt6.QtCore import (
Qt,
Expand Down Expand Up @@ -41,7 +42,8 @@
from llama_assistant.loading_animation import LoadingAnimation
from llama_assistant.setting_dialog import SettingsDialog
from llama_assistant.speech_recognition import SpeechRecognitionThread
from importlib import resources
from llama_assistant.utils import image_to_base64_data_uri
from llama_assistant.model_handler import handler as model_handler


class LlamaAssistant(QMainWindow):
Expand All @@ -51,19 +53,12 @@ def __init__(self):
self.init_ui()
self.init_tray()
self.setup_global_shortcut()
self.load_model()
self.last_response = ""
self.dropped_image = None
self.speech_thread = None
self.is_listening = False
self.image_label = None

def load_model(self):
self.model = Llama.from_pretrained(
repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
filename="*q4_k_m.gguf",
)

def load_settings(self):
home_dir = Path.home()
settings_dir = home_dir / "llama_assistant"
Expand Down Expand Up @@ -112,7 +107,8 @@ def init_ui(self):
self.setWindowTitle("AI Assistant")
self.setFixedSize(600, 200)
self.setWindowFlags(
Qt.WindowType.FramelessWindowHint | Qt.WindowType.WindowStaysOnTopHint
Qt.WindowType.FramelessWindowHint
| Qt.WindowType.WindowStaysOnTopHint
)
self.setAttribute(Qt.WidgetAttribute.WA_TranslucentBackground)

Expand All @@ -132,6 +128,7 @@ def init_ui(self):
self.input_field = CustomPlainTextEdit(self.on_submit, self)
self.input_field.setPlaceholderText("Ask me anything...")
self.input_field.setAcceptDrops(True)
self.input_field.setFixedHeight(100)
self.input_field.dragEnterEvent = self.dragEnterEvent
self.input_field.dropEvent = self.dropEvent
self.input_field.setStyleSheet(
Expand All @@ -150,7 +147,9 @@ def init_ui(self):
top_layout.addWidget(self.input_field)

# Load the mic icon from resources
with resources.path("llama_assistant.resources", "mic_icon.png") as path:
with resources.path(
"llama_assistant.resources", "mic_icon.png"
) as path:
mic_icon = QIcon(str(path))

self.mic_button = QPushButton(self)
Expand Down Expand Up @@ -197,38 +196,73 @@ def init_ui(self):

# Add new buttons
button_layout = QHBoxLayout()
button_layout.setAlignment(Qt.AlignmentFlag.AlignLeft)
self.summarize_button = QPushButton("Summarize", self)
self.rephrase_button = QPushButton("Rephrase", self)
self.fix_grammar_button = QPushButton("Fix Grammar", self)
self.brainstorm_button = QPushButton("Brainstorm", self)
self.write_email_button = QPushButton("Write Email", self)

for button in [self.summarize_button, self.rephrase_button, self.fix_grammar_button, self.brainstorm_button, self.write_email_button]:
button.clicked.connect(self.on_task_button_clicked)
button_layout.addWidget(button)

main_layout.addLayout(button_layout)

self.chat_box = QTextBrowser(self)
self.chat_box.setOpenExternalLinks(True)
self.chat_box.setFixedHeight(300)
self.chat_box.hide()
main_layout.addWidget(self.chat_box)

result_button_layout = QHBoxLayout()
result_button_layout.setContentsMargins(0, 10, 0, 0) # Add top margin

# Add new buttons to layout
result_layout = QHBoxLayout()
result_layout.setAlignment(Qt.AlignmentFlag.AlignLeft)
self.copy_button = QPushButton("Copy Result", self)
self.copy_button.clicked.connect(self.copy_result)
self.copy_button.hide()
result_button_layout.addWidget(self.copy_button)

self.clear_button = QPushButton("Clear", self)
self.clear_button.clicked.connect(self.clear_chat)
self.clear_button.hide()
result_button_layout.addWidget(self.clear_button)
result_layout.addWidget(self.copy_button)
result_layout.addWidget(self.clear_button)

for button in [
self.summarize_button,
self.rephrase_button,
self.fix_grammar_button,
self.brainstorm_button,
self.write_email_button,
]:
button.clicked.connect(self.on_task_button_clicked)
button_layout.addWidget(button)

main_layout.addLayout(result_button_layout)
main_layout.addLayout(button_layout)
main_layout.addLayout(result_layout)

# Create a scroll area for the chat box
self.scroll_area = QScrollArea(self)
self.scroll_area.setWidgetResizable(True)
self.scroll_area.setHorizontalScrollBarPolicy(
Qt.ScrollBarPolicy.ScrollBarAlwaysOff
)
self.scroll_area.setStyleSheet(
"""
QScrollArea {
border: none;
background-color: transparent;
}
QScrollBar:vertical {
border: none;
background: rgba(255, 255, 255, 0.1);
width: 10px;
margin: 0px 0px 0px 0px;
}
QScrollBar::handle:vertical {
background: rgba(255, 255, 255, 0.3);
min-height: 20px;
border-radius: 5px;
}
QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical {
border: none;
background: none;
}
"""
)

self.chat_box = QTextBrowser(self.scroll_area)
self.chat_box.setOpenExternalLinks(True)
self.scroll_area.setWidget(self.chat_box)
self.scroll_area.hide()
main_layout.addWidget(self.scroll_area)

self.loading_animation = LoadingAnimation(self)
self.loading_animation.setFixedSize(50, 50)
Expand Down Expand Up @@ -276,7 +310,26 @@ def update_styles(self):
background-color: rgba{QColor(self.settings["color"]).lighter(120).getRgb()[:3] + (opacity,)};
}}
"""
for button in [self.copy_button, self.clear_button, self.rephrase_button, self.fix_grammar_button, self.brainstorm_button, self.write_email_button, self.summarize_button]:
for button in [
self.rephrase_button,
self.fix_grammar_button,
self.brainstorm_button,
self.write_email_button,
self.summarize_button,
]:
button.setStyleSheet(button_style)

button_style = f"""
QPushButton {{
{base_style}
padding: 2.5px 5px;
border-radius: 5px;
}}
QPushButton:hover {{
background-color: rgba(200, 200, 200, 0.8);
}}
"""
for button in [self.copy_button, self.clear_button]:
button.setStyleSheet(button_style)

def center_on_screen(self):
Expand Down Expand Up @@ -329,7 +382,9 @@ def toggle_visibility(self):
def on_submit(self):
message = self.input_field.toPlainText()
self.input_field.clear()
self.loading_animation.move(self.width() // 2 - 25, self.height() // 2 - 25)
self.loading_animation.move(
self.width() // 2 - 25, self.height() // 2 - 25
)
self.loading_animation.start_animation()

if self.dropped_image:
Expand All @@ -353,39 +408,36 @@ def process_text(self, message, task="chat"):
elif task == "write email":
prompt = f"Write an email about: {message}"

output = self.model.create_chat_completion(
messages = [
{
"role": "user",
"content": prompt
}
]
)
response = output["choices"][0]["message"]["content"]

response = model_handler.chat_completion("llama_text", prompt)
self.last_response = response

self.chat_box.append(f"<b>You:</b> {message}")
self.chat_box.append(f"<b>AI ({task}):</b> {markdown.markdown(response)}")
self.chat_box.append(
f"<b>AI ({task}):</b> {markdown.markdown(response)}"
)
self.loading_animation.stop_animation()
self.show_chat_box()

def process_image_with_prompt(self, image_path, prompt):
    """Send an image plus a text prompt to the "moondream" model and show the reply.

    The image is converted with ``image_to_base64_data_uri`` before being
    passed to ``model_handler.chat_completion``. The user's upload notice,
    the prompt and the model's answer are appended to the chat box, the
    loading animation is stopped and the chat box is revealed.

    Args:
        image_path: Path of the image to send to the model.
        prompt: Text prompt that accompanies the image.
    """
    response = model_handler.chat_completion(
        "moondream", prompt, image=image_to_base64_data_uri(image_path)
    )
    # Record the latest answer on the instance, as process_text does, so
    # both paths leave ``last_response`` up to date. An empty/None reply is
    # stored as "" (the value __init__ starts with).
    self.last_response = response or ""

    self.chat_box.append(f"<b>You:</b> [Uploaded an image: {image_path}]")
    self.chat_box.append(f"<b>You:</b> {prompt}")

    # Apply the fallback to the answer text only, so the "AI:" speaker
    # label is kept even when the model returns nothing.
    answer_html = markdown.markdown(response) if response else "No response"
    self.chat_box.append(f"<b>AI:</b> {answer_html}")

    self.loading_animation.stop_animation()
    self.show_chat_box()

def show_chat_box(self):
if self.chat_box.isHidden():
self.chat_box.show()
if self.scroll_area.isHidden():
self.scroll_area.show()
self.copy_button.show()
self.clear_button.show()
self.setFixedHeight(450)
self.setFixedHeight(600) # Increase this value if needed
self.chat_box.verticalScrollBar().setValue(
self.chat_box.verticalScrollBar().maximum()
)
Expand All @@ -409,9 +461,13 @@ def dragEnterEvent(self, event: QDragEnterEvent):
def dropEvent(self, event: QDropEvent):
files = [u.toLocalFile() for u in event.mimeData().urls()]
for file_path in files:
if file_path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
if file_path.lower().endswith(
(".png", ".jpg", ".jpeg", ".gif", ".bmp")
):
self.dropped_image = file_path
self.input_field.setPlaceholderText("Enter a prompt for the image...")
self.input_field.setPlaceholderText(
"Enter a prompt for the image..."
)
self.show_image_thumbnail(file_path)
break

Expand Down Expand Up @@ -549,4 +605,11 @@ def on_speech_recognized(self, text):
self.input_field.setPlainText(text)

def on_speech_error(self, error_message):
print(error_message)
print(error_message)


if __name__ == "__main__":
    # Script entry point: create the Qt application, show the assistant
    # window and run the event loop.
    import sys

    app = QApplication([])
    assistant = LlamaAssistant()
    assistant.show()
    # Hand the event loop's return code to the interpreter so it becomes
    # the process exit status instead of being discarded.
    sys.exit(app.exec())
6 changes: 4 additions & 2 deletions llama_assistant/loading_animation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def paintEvent(self, event):
painter.setBrush(color)
painter.setPen(Qt.PenStyle.NoPen)

painter.drawEllipse(QPointF(x, y), self.dot_radius, self.dot_radius)
painter.drawEllipse(
QPointF(x, y), self.dot_radius, self.dot_radius
)

@property
def rotation(self):
Expand All @@ -65,4 +67,4 @@ def rotation(self):
@rotation.setter
def rotation(self, value):
self._rotation = value
self.update()
self.update()
3 changes: 2 additions & 1 deletion llama_assistant/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ def main():
ex.show()
sys.exit(app.exec())


if __name__ == "__main__":
main()
main()
Loading

0 comments on commit 298fcb3

Please sign in to comment.