
Commit

Make recording smooth
vietanhdev committed Oct 6, 2024
1 parent e94cb37 commit ffa3dd4
Showing 2 changed files with 63 additions and 33 deletions.
21 changes: 19 additions & 2 deletions llama_assistant/llama_assistant_app.py
@@ -686,19 +686,36 @@ def start_voice_input(self):
self.mic_button.setStyleSheet(
"""
QPushButton {
background-color: rgba(255, 0, 0, 0.5);
background-color: rgba(240, 150, 20, 0.5);
border: none;
border-radius: 20px;
}
QPushButton:hover {
background-color: rgba(255, 0, 0, 0.6);
background-color: rgba(240, 150, 20, 0.6);
}
"""
)
self.speech_thread = SpeechRecognitionThread()
self.speech_thread.finished.connect(self.on_speech_recognized)
self.speech_thread.error.connect(self.on_speech_error)
self.speech_thread.start()

# Use QTimer to delay the application of the second style
QTimer.singleShot(500, self.update_mic_button_style)

def update_mic_button_style(self):
self.mic_button.setStyleSheet(
"""
QPushButton {
background-color: rgba(255, 0, 0, 0.5);
border: none;
border-radius: 20px;
}
QPushButton:hover {
background-color: rgba(255, 0, 0, 0.6);
}
"""
)

def stop_voice_input(self):
if self.speech_thread and self.speech_thread.isRunning():
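The hunk above replaces the immediate red "recording" style with a brief orange transition style, then switches to red 500 ms later via QTimer.singleShot. Below is a minimal, self-contained sketch of that delayed-restyle pattern, assuming PyQt5 (the binding this app appears to use); the widget and constant names are illustrative, not part of the commit.

    # Hedged sketch: temporary style applied immediately, final style applied after a 500 ms delay.
    # Assumes PyQt5; the names STARTING_STYLE, RECORDING_STYLE, and button are illustrative only.
    import sys
    from PyQt5.QtCore import QTimer
    from PyQt5.QtWidgets import QApplication, QPushButton

    STARTING_STYLE = (
        "QPushButton { background-color: rgba(240, 150, 20, 0.5);"
        " border: none; border-radius: 20px; }"
    )
    RECORDING_STYLE = (
        "QPushButton { background-color: rgba(255, 0, 0, 0.5);"
        " border: none; border-radius: 20px; }"
    )

    app = QApplication(sys.argv)
    button = QPushButton("mic")
    button.setStyleSheet(STARTING_STYLE)  # shown the moment voice input starts
    QTimer.singleShot(500, lambda: button.setStyleSheet(RECORDING_STYLE))  # swapped in after 500 ms
    button.show()
    sys.exit(app.exec_())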
75 changes: 44 additions & 31 deletions llama_assistant/speech_recognition_thread.py
@@ -14,10 +14,12 @@ class SpeechRecognitionThread(QThread):
finished = pyqtSignal(str)
error = pyqtSignal(str)
WHISPER_THREADS = 1
MAX_RECORDING_TIME = 60 # Maximum recording time in seconds

def __init__(self):
super().__init__()
self.stop_listening = False
self.recording = False

# Initialize Whisper model
self.whisper = Whisper("tiny")
@@ -31,46 +31,54 @@ def __init__(self):
self.CHANNELS = 1
self.RATE = 16000
self.CHUNK = 1024
self.RECORD_SECONDS = 2 # Reduced to 2 seconds for more frequent transcription

def wait_for_recording(self):
while not self.recording:
time.sleep(0.1)

def run(self):
self.stop_listening = False
audio = pyaudio.PyAudio()
frames = []
start_time = time.time()

try:
stream = audio.open(format=self.FORMAT, channels=self.CHANNELS,
rate=self.RATE, input=True,
frames_per_buffer=self.CHUNK)

print("Always-on microphone activated. Listening...")

while not self.stop_listening:
frames = []
for _ in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
data = stream.read(self.CHUNK)
frames.append(data)

# Save audio data to temporary file
tmp_filepath = self.tmp_audio_folder / f"temp_audio_{time.time()}.wav"
wf = wave.open(str(tmp_filepath), 'wb')
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(b''.join(frames))
wf.close()

# Transcribe audio
res = self.whisper.transcribe(str(tmp_filepath))
transcription = self.whisper.extract_text(res)
os.remove(tmp_filepath)

if isinstance(transcription, list):
# Remove all "[BLANK_AUDIO]" from the transcription
transcription = " ".join(transcription)
transcription = re.sub(r"\[BLANK_AUDIO\]", "", transcription)

if transcription.strip(): # Only emit if there's non-empty transcription
self.finished.emit(transcription)
self.recording = True

while not self.stop_listening and (time.time() - start_time) < self.MAX_RECORDING_TIME:
data = stream.read(self.CHUNK)
frames.append(data)

self.recording = False
print("Stopped recording. Processing audio...")

# Save audio data to temporary file
tmp_filepath = self.tmp_audio_folder / f"temp_audio_{time.time()}.wav"
wf = wave.open(str(tmp_filepath), 'wb')
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(b''.join(frames))
wf.close()

# Transcribe audio
res = self.whisper.transcribe(str(tmp_filepath))
transcription = self.whisper.extract_text(res)
os.remove(tmp_filepath)

if isinstance(transcription, list):
# Remove all "[BLANK_AUDIO]" from the transcription
transcription = " ".join(transcription)
transcription = re.sub(r"\[BLANK_AUDIO\]", "", transcription)


if transcription.strip(): # Only emit if there's non-empty transcription
self.finished.emit(transcription)

except Exception as e:
self.error.emit(f"An error occurred: {str(e)}")
@@ -123,12 +133,15 @@ def start_recognition(self):

def stop_recognition(self):
self.thread.stop()
self.status_label.setText("Always-on speech recognition stopped")
self.start_button.setEnabled(True)
self.status_label.setText("Processing recorded audio...")
self.start_button.setEnabled(False)
self.stop_button.setEnabled(False)

def on_finished(self, text):
self.transcription_label.setText(f"Transcription: {text}")
self.status_label.setText("Speech recognition completed")
self.start_button.setEnabled(True)
self.stop_button.setEnabled(False)

def on_error(self, error_message):
self.status_label.setText(f"Error: {error_message}")
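Taken together, these changes make the thread record continuously into a single buffer (up to MAX_RECORDING_TIME seconds) and transcribe once after recording stops, instead of transcribing every two seconds. Below is a hedged sketch of how a caller could drive the reworked thread, using only the API visible in this diff (start(), stop(), wait_for_recording(), and the finished/error signals); the handler names are illustrative, and a running Qt event loop is assumed for cross-thread signal delivery.

    # Hedged sketch based only on the API visible in this diff; handler names are illustrative.
    from llama_assistant.speech_recognition_thread import SpeechRecognitionThread

    def on_text(text):
        # Emitted once, after recording stops and Whisper has transcribed the buffered audio.
        print("Transcribed:", text)

    def on_error(message):
        print("Speech recognition error:", message)

    thread = SpeechRecognitionThread()
    thread.finished.connect(on_text)
    thread.error.connect(on_error)

    thread.start()               # audio capture begins on the worker thread
    thread.wait_for_recording()  # blocks until the stream is open and self.recording is True
    # ... user speaks; capture ends on stop() or after MAX_RECORDING_TIME (60 s) ...
    thread.stop()                # assumed to set stop_listening, as called in stop_recognition()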
