diff --git a/README.md b/README.md index 4f88cfe..fc1d98c 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ https://github.com/user-attachments/assets/af2c544b-6d46-4c44-87d8-9a051ba213db - [x] ⚡ Streaming support for response. - [x] 🎙️ Add offline STT support: WhisperCPP. - [x] 🧠 Knowledge database: LlamaIndex +- [x] ⌖ Screen Spot: Screen capture and analyze with OCR - [ ] 🔌 Plugin system for extensibility. - [ ] 📰 News and weather updates. - [ ] 📧 Email integration with Gmail and Outlook. diff --git a/llama_assistant/config.py b/llama_assistant/config.py index b966ff3..e602a11 100644 --- a/llama_assistant/config.py +++ b/llama_assistant/config.py @@ -150,6 +150,7 @@ custom_models_file = llama_assistant_dir / "custom_models.json" settings_file = llama_assistant_dir / "settings.json" document_icon = "llama_assistant/resources/document_icon.png" +ocr_tmp_file = llama_assistant_dir / "ocr_tmp.png" if custom_models_file.exists(): with open(custom_models_file, "r") as f: diff --git a/llama_assistant/icons.py b/llama_assistant/icons.py index c4564d5..4fd5dff 100644 --- a/llama_assistant/icons.py +++ b/llama_assistant/icons.py @@ -29,6 +29,16 @@ """ +crosshair_icon_svg = """ + + + + + + + +""" + def create_icon_from_svg(svg_string): svg_bytes = QByteArray(svg_string.encode("utf-8")) diff --git a/llama_assistant/llama_assistant_app.py b/llama_assistant/llama_assistant_app.py index f6e4d5e..9caa99d 100644 --- a/llama_assistant/llama_assistant_app.py +++ b/llama_assistant/llama_assistant_app.py @@ -14,11 +14,14 @@ QVBoxLayout, QMessageBox, QSystemTrayIcon, + QRubberBand ) from PyQt5.QtCore import ( Qt, QPoint, QTimer, + QSize, + QRect ) from PyQt5.QtGui import ( QPixmap, @@ -36,9 +39,10 @@ from llama_assistant.setting_dialog import SettingsDialog from llama_assistant.speech_recognition_thread import SpeechRecognitionThread from llama_assistant.utils import image_to_base64_data_uri -from llama_assistant.processing_thread import ProcessingThread +from llama_assistant.processing_thread import ProcessingThread, OCRThread from llama_assistant.ui_manager import UIManager from llama_assistant.tray_manager import TrayManager +from llama_assistant.screen_capture_widget import ScreenCaptureWidget from llama_assistant.setting_validator import validate_numeric_field from llama_assistant.utils import load_image @@ -50,6 +54,7 @@ def __init__(self): self.load_settings() self.ui_manager = UIManager(self) self.tray_manager = TrayManager(self) + self.screen_capture_widget = ScreenCaptureWidget(self) self.setup_global_shortcut() self.last_response = "" self.dropped_image = None @@ -62,6 +67,12 @@ def __init__(self): self.current_multimodal_model = self.settings.get("multimodal_model") self.processing_thread = None self.markdown_creator = mistune.create_markdown() + self.gen_mark_down = True + self.has_ocr_context = False + + def capture_screenshot(self): + self.hide() + self.screen_capture_widget.show() def tray_icon_activated(self, reason): if reason == QSystemTrayIcon.ActivationReason.Trigger: @@ -183,6 +194,37 @@ def toggle_visibility(self): self.raise_() self.ui_manager.input_field.setFocus() + def on_ocr_button_clicked(self): + self.show() + self.show_chat_box() + self.screen_capture_widget.hide() + + self.last_response = "" + self.gen_mark_down = False + + self.ui_manager.chat_box.append(f'
You: OCR this captured region') + self.ui_manager.chat_box.append('AI: ') + + self.start_cursor_pos = self.ui_manager.chat_box.textCursor().position() + + img_path = config.ocr_tmp_file + if not img_path.exists(): + print("No image find for OCR") + self.ui_manager.chat_box.append('No image found for OCR') + return + + self.processing_thread = OCRThread(img_path, streaming=True) + self.processing_thread.preloader_signal.connect(self.indicate_loading) + self.processing_thread.update_signal.connect(self.update_chat_box) + self.processing_thread.finished_signal.connect(self.on_processing_finished) + self.processing_thread.start() + + def on_ask_with_ocr_context(self): + self.show() + self.screen_capture_widget.hide() + self.has_ocr_context = True + + def on_submit(self): message = self.ui_manager.input_field.toPlainText() if message == "": @@ -196,10 +238,11 @@ def on_submit(self): for file_path in self.dropped_files: self.remove_file_thumbnail(self.file_containers[file_path], file_path) - + return self.last_response = "" + self.gen_mark_down = True if self.dropped_image: self.process_image_with_prompt(self.dropped_image, self.dropped_files, message) @@ -244,6 +287,7 @@ def process_text(self, message, file_paths, task="chat"): self.rag_setting, prompt, lookup_files=file_paths, + ocr_img_path=config.ocr_tmp_file if self.has_ocr_context else None, ) self.processing_thread.preloader_signal.connect(self.indicate_loading) @@ -251,6 +295,8 @@ def process_text(self, message, file_paths, task="chat"): self.processing_thread.finished_signal.connect(self.on_processing_finished) self.processing_thread.start() + self.has_ocr_context = False + def process_image_with_prompt(self, image_path, file_paths, prompt): self.show_chat_box() self.ui_manager.chat_box.append( @@ -270,20 +316,27 @@ def process_image_with_prompt(self, image_path, file_paths, prompt): prompt, image=image, lookup_files=file_paths, + ocr_img_path=config.ocr_tmp_file if self.has_ocr_context else None ) self.processing_thread.preloader_signal.connect(self.indicate_loading) self.processing_thread.update_signal.connect(self.update_chat_box) self.processing_thread.finished_signal.connect(self.on_processing_finished) self.processing_thread.start() + self.has_ocr_context = False + + def clear_text_from_start_pos(self): + cursor = self.ui_manager.chat_box.textCursor() + cursor.setPosition(self.start_cursor_pos) + # Select all text from the start_pos to the end + cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor) + # Remove the selected text + cursor.removeSelectedText() + def indicate_loading(self, message): while self.processing_thread.is_preloading(): + self.clear_text_from_start_pos() cursor = self.ui_manager.chat_box.textCursor() - cursor.setPosition(self.start_cursor_pos) - # Select all text from the start_pos to the end - cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor) - # Remove the selected text - cursor.removeSelectedText() # create animation where the characters are displayed one by one for c in message: if c == " ": @@ -293,23 +346,24 @@ def indicate_loading(self, message): QApplication.processEvents() # Process events to update the UI time.sleep(0.05) time.sleep(0.5) - + def update_chat_box(self, text): self.last_response += text - markdown_response = self.markdown_creator(self.last_response) - # Since cannot change the font size of the h1, h2 tag, we will replace it with h3 - markdown_response = markdown_response.replace("
<h1>", "<h3>").replace("</h1>", "</h3>") -        markdown_response = markdown_response.replace("<h2>", "<h3>").replace("</h2>", "</h3>") -        markdown_response += "<br>
" + + formatted_text = "" + if self.gen_mark_down: + markdown_response = self.markdown_creator(self.last_response) + # Since cannot change the font size of the h1, h2 tag, we will replace it with h3 + markdown_response = markdown_response.replace("
<h1>", "<h3>").replace("</h1>", "</h3>") +            markdown_response = markdown_response.replace("<h2>", "<h3>").replace("</h2>", "</h3>") +            markdown_response += "<br>
" + formatted_text = markdown_response + else: + formatted_text = self.last_response.replace("\n", "
") + "
" + + self.clear_text_from_start_pos() cursor = self.ui_manager.chat_box.textCursor() - cursor.setPosition( - self.start_cursor_pos - ) # regenerate the updated text from the start position - # Select all text from the start_pos to the end - cursor.movePosition(QTextCursor.End, QTextCursor.KeepAnchor) - # Remove the selected text - cursor.removeSelectedText() - cursor.insertHtml(markdown_response) + cursor.insertHtml(formatted_text) self.ui_manager.chat_box.verticalScrollBar().setValue( self.ui_manager.chat_box.verticalScrollBar().maximum() ) @@ -514,14 +568,6 @@ def remove_image_thumbnail(self): self.ui_manager.input_field.setPlaceholderText("Ask me anything...") self.setFixedHeight(self.height() - 110) # Decrease height after removing image - def mousePressEvent(self, event): - self.oldPos = event.globalPos() - - def mouseMoveEvent(self, event): - delta = QPoint(event.globalPos() - self.oldPos) - self.move(self.x() + delta.x(), self.y() + delta.y()) - self.oldPos = event.globalPos() - def on_wake_word_detected(self, model_name): self.show() self.activateWindow() diff --git a/llama_assistant/model_handler.py b/llama_assistant/model_handler.py index c149b18..6b8323e 100644 --- a/llama_assistant/model_handler.py +++ b/llama_assistant/model_handler.py @@ -214,6 +214,10 @@ def chat_completion( return response def update_chat_history(self, message: str, role: str): + if self.loaded_agent is None: + print("Agent has not been initialized. Cannot update chat history.") + return + agent = self.loaded_agent.get("agent") if agent: agent.chat_history.add_message({"role": role, "content": message}) diff --git a/llama_assistant/ocr_engine.py b/llama_assistant/ocr_engine.py new file mode 100644 index 0000000..847dc3d --- /dev/null +++ b/llama_assistant/ocr_engine.py @@ -0,0 +1,152 @@ +from paddleocr import PaddleOCR +from PIL import Image +import copy +import numpy as np +import time + +def group_boxes_to_lines(bboxes, vertical_tolerance=5): + """ + Groups bounding boxes into lines based on vertical alignment. + + Args: + bboxes: List of bounding boxes [(xmin, ymin, xmax, ymax)]. + vertical_tolerance: Tolerance for vertical proximity to consider boxes in the same line. + + Returns: + List of lines, where each line is a list of bounding boxes. + """ + # Sort bounding boxes by ymin (top edge) + bboxes = sorted(bboxes, key=lambda bbox: bbox[1]) + + lines = [] + current_line = [] + current_ymin = None + + for bbox in bboxes: + xmin, ymin, xmax, ymax = bbox + + # Check if starting a new line or current box is not vertically aligned + if current_ymin is None or ymin > current_ymin + vertical_tolerance: + # Save the current line if not empty + if current_line: + # Sort current line by xmin + current_line.sort(key=lambda box: box[0]) + lines.append(current_line) + # Start a new line + current_line = [bbox] + current_ymin = ymin + else: + # Add box to the current line + current_line.append(bbox) + + # Add the last line if any + if current_line: + current_line.sort(key=lambda box: box[0]) + lines.append(current_line) + + return lines + +def quad_to_rect(quad_boxes): + """ + Converts a quadrilateral bounding box to a rectangular bounding box. + + Args: + quad_boxes: List of 4 points [(x1, y1), (x2, y2), (x3, y3), (x4, y4)] representing the quadrilateral. + + Returns: + List of rectangular bounding box (xmin, ymin, xmax, ymax). 
+ """ + result = [] + for quad_box in quad_boxes: + quad_box = quad_box.astype(np.int32) + # Extract all x and y coordinates + x_coords = quad_box[:, 0] + y_coords = quad_box[:, 1] + + # Find the enclosing rectangle + xmin = np.min(x_coords) + ymin = np.min(y_coords) + xmax = np.max(x_coords) + ymax = np.max(y_coords) + + result.append((xmin, ymin, xmax, ymax)) + + return result + +class OCREngine: + def __init__(self): + self.ocr = None + + def load_ocr(self, processign_thread=None): + if self.ocr is None: + if processign_thread: + processign_thread.set_preloading(True, "Initializing OCR ....") + + self.ocr = PaddleOCR(use_angle_cls=True, lang='en') + time.sleep(1.2) + + if processign_thread: + processign_thread.set_preloading(False, "...") + + def perform_ocr(self, img_path, streaming=False, processing_thread=None): + self.load_ocr(processing_thread) + img = np.array(Image.open(img_path).convert('RGB')) + ori_im = img.copy() + + # text detection + dt_boxes, _ = self.ocr.text_detector(img) + if dt_boxes is None: + return None + + img_crop_list = [] + dt_boxes = quad_to_rect(dt_boxes) + + # group boxes into lines + lines = group_boxes_to_lines(dt_boxes) + + # return a generator if streaming + if streaming: + def generate_result(): + for boxes in lines: + img_crop_list = [] + for bno in range(len(boxes)): + xmin, ymin, xmax, ymax = copy.deepcopy(boxes[bno]) + img_crop = ori_im[ymin:ymax, xmin:xmax] + if any([dim <= 0 for dim in img_crop.shape[:2]]): + continue + img_crop_list.append(img_crop) + rec_res, _ = self.ocr.text_recognizer(img_crop_list) + + line_text = "" + for rec_result in rec_res: + text, score = rec_result + if score >= self.ocr.drop_score: + line_text += text + " " + + yield line_text+'\n' + + return generate_result() + + # non-streaming + full_result = "" + for boxes in lines: + img_crop_list = [] + for bno in range(len(boxes)): + xmin, ymin, xmax, ymax = copy.deepcopy(boxes[bno]) + img_crop = ori_im[ymin:ymax, xmin:xmax] + if any([dim <= 0 for dim in img_crop.shape[:2]]): + continue + img_crop_list.append(img_crop) + rec_res, _ = self.ocr.text_recognizer(img_crop_list) + + line_text = "" + for rec_result in rec_res: + text, score = rec_result + if score >= self.ocr.drop_score: + line_text += text + " " + + full_result += line_text + '\n' + + return full_result + +ocr_engine = OCREngine() \ No newline at end of file diff --git a/llama_assistant/processing_thread.py b/llama_assistant/processing_thread.py index 3cc7a8b..04bc6c2 100644 --- a/llama_assistant/processing_thread.py +++ b/llama_assistant/processing_thread.py @@ -4,7 +4,7 @@ pyqtSignal, ) from llama_assistant.model_handler import handler as model_handler - +from llama_assistant.ocr_engine import ocr_engine class ProcessingThread(QThread): preloader_signal = pyqtSignal(str) @@ -19,6 +19,7 @@ def __init__( prompt: str, lookup_files: Optional[Set[str]] = None, image: str = None, + ocr_img_path: str = None, ): super().__init__() self.model = model @@ -27,9 +28,17 @@ def __init__( self.prompt = prompt self.image = image self.lookup_files = lookup_files - self.preloadong = False + self.preloading = False + self.ocr_img_path = ocr_img_path def run(self): + if self.ocr_img_path: + self.set_preloading(True, "Thinking ....") + ocr_output = ocr_engine.perform_ocr(self.ocr_img_path, streaming=False) + ocr_output = f"Here is the OCR result:\n{ocr_output}\n" + self.prompt = ocr_output + self.prompt + print("Prompt with OCR context:", self.prompt) + output = model_handler.chat_completion( self.model, self.generation_setting, @@ 
-65,3 +74,44 @@ def set_preloading(self, preloading: bool, message: str): def is_preloading(self): return self.preloading + +class OCRThread(QThread): + preloader_signal = pyqtSignal(str) + update_signal = pyqtSignal(str) + finished_signal = pyqtSignal() + + def __init__(self, img_path: str, streaming: bool = False): + super().__init__() + self.img_path = img_path + self.preloading = False + self.streaming = streaming + self.is_ocr_done = False + + def emit_preloading_message(self, message: str): + self.preloader_signal.emit(message) + + def set_preloading(self, preloading: bool, message: str): + self.preloading = preloading + self.emit_preloading_message(message) + + def is_preloading(self): + return self.preloading + + def run(self): + output = ocr_engine.perform_ocr(self.img_path, streaming=self.streaming, processing_thread=self) + full_response_str = "Here is the OCR result:\n" + self.is_ocr_done = True + + if not self.streaming and type(output) == str: + self.update_signal.emit(full_response_str + output) + return + + self.update_signal.emit(full_response_str) + for chunk in output: + self.update_signal.emit(chunk) + full_response_str += chunk + + model_handler.update_chat_history("OCR this image:", "user") + model_handler.update_chat_history(full_response_str, "assistant") + self.finished_signal.emit() + \ No newline at end of file diff --git a/llama_assistant/screen_capture_widget.py b/llama_assistant/screen_capture_widget.py new file mode 100644 index 0000000..8523578 --- /dev/null +++ b/llama_assistant/screen_capture_widget.py @@ -0,0 +1,166 @@ +from typing import TYPE_CHECKING +from PyQt5.QtWidgets import QApplication, QWidget, QDesktopWidget, QPushButton +from PyQt5.QtCore import Qt, QRect +from PyQt5.QtGui import QPainter, QColor, QPen + +from llama_assistant import config +from llama_assistant.ocr_engine import OCREngine +if TYPE_CHECKING: + from llama_assistant.llama_assistant_app import LlamaAssistantApp + + +class ScreenCaptureWidget(QWidget): + def __init__(self, parent: "LlamaAssistantApp"): + super().__init__() + self.setWindowFlags(Qt.FramelessWindowHint) + + self.parent = parent + self.ocr_engine = OCREngine() + + # Get screen size + screen = QDesktopWidget().screenGeometry() + self.setGeometry(0, 0, screen.width(), screen.height()) + + # Set crosshairs cursor + self.setCursor(Qt.CrossCursor) + + # To store the start and end points of the mouse region + self.start_point = None + self.end_point = None + + # Buttons to appear after selection + self.button_widget = QWidget() + self.ocr_button = QPushButton("OCR", self.button_widget) + self.ask_button = QPushButton("Ask", self.button_widget) + self.ocr_button.setCursor(Qt.PointingHandCursor) + self.ask_button.setCursor(Qt.PointingHandCursor) + opacity = self.parent.settings.get("transparency", 90) / 100 + base_style = f""" + border: none; + border-radius: 20px; + color: white; + padding: 10px 15px; + font-size: 16px; + """ + button_style = f""" + QPushButton {{ + {base_style} + padding: 2.5px 5px; + border-radius: 5px; + background-color: rgba{QColor(self.parent.settings["color"]).lighter(120).getRgb()[:3] + (opacity,)}; + }} + """ + self.ocr_button.setStyleSheet(button_style) + self.ask_button.setStyleSheet(button_style) + self.button_widget.hide() + + # Connect button signals + self.ocr_button.clicked.connect(self.parent.on_ocr_button_clicked) + self.ask_button.clicked.connect(self.parent.on_ask_with_ocr_context) + + def show(self): + # remove painting if any + self.start_point = None + self.end_point = None + 
self.update() + + # Set window opacity to 50% + self.setWindowOpacity(0.5) + # self.setAttribute(Qt.WA_TranslucentBackground, True) + super().show() + + def hide(self): + self.button_widget.hide() + super().hide() + + + def mousePressEvent(self, event): + if event.button() == Qt.LeftButton: + self.start_point = event.pos() # Capture start position + self.end_point = event.pos() # Initialize end point to start position + print(f"Mouse press at {self.start_point}") + + def mouseReleaseEvent(self, event): + if event.button() == Qt.LeftButton: + self.end_point = event.pos() # Capture end position + + print(f"Mouse release at {self.end_point}") + + # Capture the region between start and end points + if self.start_point and self.end_point: + self.capture_region(self.start_point, self.end_point) + + # Trigger repaint to show the red rectangle + self.update() + + self.show_buttons() + + + def mouseMoveEvent(self, event): + if self.start_point: + # Update the end_point to the current mouse position as it moves + self.end_point = event.pos() + + # Trigger repaint to update the rectangle + self.update() + + def capture_region(self, start_point, end_point): + # Convert local widget coordinates to global screen coordinates + start_global = self.mapToGlobal(start_point) + end_global = self.mapToGlobal(end_point) + + # Create a QRect from the global start and end points + region_rect = QRect(start_global, end_global) + + # Ensure the rectangle is valid (non-negative width/height) + region_rect = region_rect.normalized() + + # Capture the screen region + screen = QApplication.primaryScreen() + pixmap = screen.grabWindow(0, region_rect.x(), region_rect.y(), region_rect.width(), region_rect.height()) + + # Save the captured region as an image + pixmap.save(str(config.ocr_tmp_file), "PNG") + print(f"Captured region saved at '{config.ocr_tmp_file}'.") + + def paintEvent(self, event): + # If the start and end points are set, draw the rectangle + if self.start_point and self.end_point: + # Create a painter object + painter = QPainter(self) + + # Set the pen color to red + pen = QPen(QColor(255, 0, 0)) # Red color + pen.setWidth(3) # Set width of the border + painter.setPen(pen) + + # Draw the rectangle from start_point to end_point + self.region_rect = QRect(self.start_point, self.end_point) + self.region_rect = self.region_rect.normalized() # Normalize to ensure correct width/height + + painter.drawRect(self.region_rect) # Draw the rectangle + + super().paintEvent(event) # Call the base class paintEvent + + def show_buttons(self): + if self.start_point and self.end_point: + # Get normalized rectangle + rect = QRect(self.start_point, self.end_point).normalized() + + # Calculate button positions + button_y = rect.bottom() + 10 # Place buttons below the rectangle + button_width = 80 + button_height = 30 + spacing = 10 + + print("Showing buttons") + + self.ocr_button.setGeometry(0, 0, button_width, button_height) + self.ask_button.setGeometry(button_width + spacing, 0, button_width, button_height) + + self.button_widget.setGeometry(rect.left(), button_y, 2 * button_width + spacing, button_height) + self.button_widget.setAttribute(Qt.WA_TranslucentBackground) + self.button_widget.setWindowFlags(Qt.FramelessWindowHint) + self.button_widget.show() + + \ No newline at end of file diff --git a/llama_assistant/ui_manager.py b/llama_assistant/ui_manager.py index 651f155..817e60e 100644 --- a/llama_assistant/ui_manager.py +++ b/llama_assistant/ui_manager.py @@ -24,6 +24,7 @@ copy_icon_svg, clear_icon_svg, 
microphone_icon_svg, + crosshair_icon_svg ) @@ -87,6 +88,7 @@ def init_ui(self): self.input_field.dropEvent = self.parent.dropEvent input_layout.addWidget(self.input_field) + button_layout = QVBoxLayout() self.mic_button = QPushButton(self.parent) self.mic_button.setIcon(create_icon_from_svg(microphone_icon_svg)) self.mic_button.setIconSize(QSize(24, 24)) @@ -104,7 +106,29 @@ def init_ui(self): } """ ) - input_layout.addWidget(self.mic_button) + button_layout.addWidget(self.mic_button) + + self.screenshot_button = QPushButton(self.parent) + self.screenshot_button.setIcon(create_icon_from_svg(crosshair_icon_svg)) + self.screenshot_button.setIconSize(QSize(24, 24)) + self.screenshot_button.setFixedSize(40, 40) + self.screenshot_button.clicked.connect(self.parent.capture_screenshot) + self.screenshot_button.setToolTip("Screen Spot") + self.screenshot_button.setStyleSheet( + """ + QPushButton { + background-color: rgba(100, 100, 100, 200); + border: none; + border-radius: 20px; + } + QPushButton:hover { + background-color: rgba(100, 100, 100, 230); + } + """ + ) + button_layout.addWidget(self.screenshot_button) + + input_layout.addLayout(button_layout) close_button = QPushButton("×", self.parent) close_button.clicked.connect(self.parent.hide) diff --git a/pyproject.toml b/pyproject.toml index e0a3512..a792bc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "llama-index-embeddings-huggingface==0.4.0", "docx2txt==0.8", "mistune==3.0.2", + "paddlepaddle==2.6.2", + "paddleocr==2.9.1", "whispercpp @ git+https://github.com/stlukey/whispercpp.py" ] dynamic = [] diff --git a/requirements.txt b/requirements.txt index 172f12a..f2f417b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,6 @@ llama-index-readers-file==0.4.0 llama-index-embeddings-huggingface==0.4.0 docx2txt==0.8 mistune==3.0.2 +paddlepaddle==2.6.2 +paddleocr==2.9.1 git+https://github.com/stlukey/whispercpp.py
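For reviewers, a minimal usage sketch (not part of the diff) of the two geometry helpers introduced in llama_assistant/ocr_engine.py: quad_to_rect collapses each detector quadrilateral into its enclosing rectangle, and group_boxes_to_lines buckets those rectangles into reading-order lines. The coordinates below are invented for illustration, and importing the module assumes the paddlepaddle/paddleocr dependencies added above are installed, since PaddleOCR is imported at module load.

```python
import numpy as np

# Importing ocr_engine pulls in paddleocr, so the new dependencies must be installed.
from llama_assistant.ocr_engine import group_boxes_to_lines, quad_to_rect

# Detector-style output: one quadrilateral (4 corner points) per text region.
# These coordinates are made up for illustration.
quads = [
    np.array([[10, 10], [100, 12], [99, 28], [11, 27]]),      # "Hello"
    np.array([[120, 12], [200, 12], [200, 30], [120, 30]]),   # "world"
    np.array([[10, 60], [150, 60], [150, 80], [10, 80]]),     # a second line
]

# Each quad becomes its enclosing (xmin, ymin, xmax, ymax) rectangle.
rects = quad_to_rect(quads)

# Rectangles whose top edges fall within vertical_tolerance of the first box in a
# line are grouped together, then each line is sorted left-to-right by xmin.
lines = group_boxes_to_lines(rects, vertical_tolerance=5)
# -> first line: (10, 10, 100, 28), (120, 12, 200, 30); second line: (10, 60, 150, 80)
print(lines)
```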
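OCRThread.run consumes OCREngine.perform_ocr in streaming mode; the sketch below shows the same consumption pattern outside the Qt thread, with "screenshot.png" as a placeholder path standing in for config.ocr_tmp_file (same install assumption as above).

```python
from llama_assistant.ocr_engine import ocr_engine

# streaming=True returns a generator that yields one recognized line per iteration
# (each ending in "\n"); streaming=False returns the full string; None means the
# detector found no text regions in the image.
chunks = ocr_engine.perform_ocr("screenshot.png", streaming=True)  # placeholder path

if chunks is None:
    print("No text detected")
else:
    full_text = ""
    for line in chunks:
        full_text += line  # OCRThread emits each chunk through update_signal instead
    print(full_text)
```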