Spaces:

AndroidGuy
/

Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 24

Commit

42eafc4

1 Parent(s): b37c0fc

Code fixing

Browse files

Files changed (1) hide show

app.py +244 -190

app.py CHANGED Viewed

@@ -8,14 +8,20 @@ import os
 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
 import json
 import io
 import wave
-from fastrtc import Stream, ReplyOnPause, AsyncStreamHandler, get_stt_model
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
-FINAL_TRANSCRIPTION_MODEL = "moonshine/base"  # Using FastRTC's moonshine model
 SILERO_SENSITIVITY = 0.4
 WEBRTC_SENSITIVITY = 3
 MIN_LENGTH_OF_RECORDING = 0.7
@@ -267,65 +273,12 @@ class SpeakerChangeDetector:
         }
-class DiarizationStreamHandler(AsyncStreamHandler):
-    """FastRTC stream handler for real-time diarization"""
-    def __init__(self, diarization_system):
-        super().__init__(input_sample_rate=16000)
-        self.diarization_system = diarization_system
-        self.stt_model = get_stt_model(model=FINAL_TRANSCRIPTION_MODEL)
-        self.current_text = ""
-        self.current_audio_buffer = []
-        self.transcript_queue = queue.Queue()
-    def copy(self):
-        return DiarizationStreamHandler(self.diarization_system)
-    async def start_up(self):
-        """Initialize the stream handler"""
-        pass
-    async def receive(self, frame):
-        """Process incoming audio frame"""
-        # Extract audio data
-        sample_rate, audio_data = frame
-        # Convert to numpy array if needed
-        if isinstance(audio_data, torch.Tensor):
-            audio_data = audio_data.numpy()
-        # Add to buffer
-        self.current_audio_buffer.append(audio_data)
-        # If buffer is large enough, process it
-        if len(self.current_audio_buffer) > 3:  # Process ~1.5 seconds of audio
-            # Concatenate audio data
-            combined_audio = np.concatenate(self.current_audio_buffer)
-            # Run speech-to-text
-            text = self.stt_model.stt((16000, combined_audio))
-            if text and text.strip():
-                # Save text and audio for processing
-                self.transcript_queue.put((text, combined_audio))
-                self.current_text = text
-            # Reset buffer but keep some overlap
-            if len(self.current_audio_buffer) > 5:
-                self.current_audio_buffer = self.current_audio_buffer[-2:]
-    async def emit(self):
-        """Emit processed data"""
-        # Return current text as dummy; actual processing is done in background
-        return self.current_text
 class RealtimeSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.stream = None
-        self.stream_handler = None
         self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
@@ -335,6 +288,7 @@ class RealtimeSpeakerDiarization:
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
     def initialize_models(self):
         """Initialize the speaker encoder model"""
@@ -361,69 +315,45 @@ class RealtimeSpeakerDiarization:
             print(f"Model initialization error: {e}")
             return False
-    def start_stream(self, app):
-        """Start the FastRTC stream"""
-        if self.encoder is None:
-            return "Please initialize models first!"
-        try:
-            # Create a FastRTC stream handler
-            self.stream_handler = DiarizationStreamHandler(self)
-            # Create FastRTC stream
-            self.stream = Stream(
-                handler=self.stream_handler,
-                modality="audio",
-                mode="send-receive"
             )
-            # Mount the stream to the provided FastAPI app
-            self.stream.mount(app)
-            # Start sentence processing thread
-            self.is_running = True
-            self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
-            self.sentence_thread.start()
-            # Start diarization processor thread
-            self.diarization_thread = threading.Thread(target=self.process_transcript_queue, daemon=True)
-            self.diarization_thread.start()
-            return "Stream started successfully! Ready for audio input."
-        except Exception as e:
-            return f"Error starting stream: {e}"
-    def process_transcript_queue(self):
-        """Process transcripts from the stream handler"""
-        while self.is_running:
             try:
-                if self.stream_handler and not self.stream_handler.transcript_queue.empty():
-                    text, audio_data = self.stream_handler.transcript_queue.get(timeout=1)
-                    # Add to sentence queue for diarization
-                    self.pending_sentences.append(text)
-                    self.sentence_queue.put((text, audio_data))
-            except queue.Empty:
-                time.sleep(0.1)  # Short sleep to prevent CPU hogging
             except Exception as e:
-                print(f"Error processing transcript queue: {e}")
-                time.sleep(0.5)  # Slightly longer sleep on error
     def process_sentence_queue(self):
         """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
-                text, audio_data = self.sentence_queue.get(timeout=1)
                 # Convert audio data to int16
-                if isinstance(audio_data, np.ndarray):
-                    if audio_data.dtype != np.int16:
-                        audio_int16 = (audio_data * 32767).astype(np.int16)
-                    else:
-                        audio_int16 = audio_data
-                else:
-                    audio_int16 = np.int16(audio_data * 32767)
                 # Extract speaker embedding
                 speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
@@ -442,16 +372,73 @@ class RealtimeSpeakerDiarization:
                 # Remove from pending
                 if text in self.pending_sentences:
                     self.pending_sentences.remove(text)
             except queue.Empty:
                 continue
             except Exception as e:
                 print(f"Error processing sentence: {e}")
-    def stop_stream(self):
-        """Stop the stream and processing"""
         self.is_running = False
-        return "Stream stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
@@ -460,6 +447,7 @@ class RealtimeSpeakerDiarization:
         self.pending_sentences = []
         self.displayed_text = ""
         self.last_realtime_text = ""
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -491,6 +479,7 @@ class RealtimeSpeakerDiarization:
                 sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
                 else:
                     speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
@@ -539,38 +528,130 @@ class RealtimeSpeakerDiarization:
         except Exception as e:
             return f"Error getting status: {e}"
 # Global instance
 diarization_system = RealtimeSpeakerDiarization()
-# Create Gradio interface with FastAPI app integrated
 def create_interface():
-    app = gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome())
-    with app:
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding using FastRTC.")
         with gr.Row():
             with gr.Column(scale=2):
                 # Main conversation display
                 conversation_output = gr.HTML(
-                    value="<i>Click 'Initialize System' and then 'Start Stream' to begin...</i>",
                     label="Live Conversation"
                 )
-                # FastRTC microphone widget for visualization only (the real audio comes through FastRTC stream)
-                audio_widget = gr.Audio(
-                    label="🎙️ Microphone Input (Click Start Stream to enable)",
-                    type="microphone"
-                )
                 # Control buttons
                 with gr.Row():
                     init_btn = gr.Button("🔧 Initialize System", variant="secondary")
-                    start_btn = gr.Button("🎙️ Start Stream", variant="primary", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop Stream", variant="stop", interactive=False)
                     clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
                 # Status display
@@ -608,28 +689,12 @@ def create_interface():
                 gr.Markdown("## 📝 Instructions")
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
-                2. Click **Start Stream** to begin processing
-                3. Allow microphone access when prompted
-                4. Speak into your microphone
-                5. Watch real-time transcription with speaker labels
-                6. Adjust settings as needed
-                """)
-                # QR code for mobile access
-                gr.Markdown("## 📱 Mobile Access")
-                gr.Markdown("Scan this QR code to access from mobile device:")
-                qr_code = gr.HTML("""
-                <div id="qrcode" style="text-align: center;"></div>
-                <script src="https://cdn.jsdelivr.net/npm/qrcode-generator@1.4.4/qrcode.min.js"></script>
-                <script>
-                  setTimeout(function() {
-                    var currentUrl = window.location.href;
-                    var qr = qrcode(0, 'M');
-                    qr.addData(currentUrl);
-                    qr.make();
-                    document.getElementById('qrcode').innerHTML = qr.createImgTag(5);
-                  }, 1000);
-                </script>
                 """)
                 # Speaker color legend
@@ -639,10 +704,17 @@ def create_interface():
                     color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
         # Auto-refresh conversation and status
         def refresh_display():
-            return get_formatted_conversation(), get_status()
         # Event handlers
         def on_initialize():
@@ -652,7 +724,7 @@ def create_interface():
                     result,
                     gr.update(interactive=True),   # start_btn
                     gr.update(interactive=True),   # clear_btn
-                    get_formatted_conversation(),
                     get_status()
                 )
             else:
@@ -660,58 +732,26 @@ def create_interface():
                     result,
                     gr.update(interactive=False),  # start_btn
                     gr.update(interactive=False),  # clear_btn
-                    get_formatted_conversation(),
                     get_status()
                 )
-        def on_start_stream():
-            result = start_stream(app)
             return (
                 result,
                 gr.update(interactive=False),  # start_btn
                 gr.update(interactive=True),   # stop_btn
             )
-        def on_stop_stream():
-            result = stop_stream()
             return (
                 result,
                 gr.update(interactive=True),   # start_btn
                 gr.update(interactive=False),  # stop_btn
             )
-        def initialize_system():
-            """Initialize the diarization system"""
-            success = diarization_system.initialize_models()
-            if success:
-                return "✅ System initialized successfully! Models loaded."
-            else:
-                return "❌ Failed to initialize system. Please check the logs."
-        def start_stream(app):
-            """Start the FastRTC stream"""
-            return diarization_system.start_stream(app)
-        def stop_stream():
-            """Stop the FastRTC stream"""
-            return diarization_system.stop_stream()
-        def clear_conversation():
-            """Clear the conversation"""
-            return diarization_system.clear_conversation()
-        def update_settings(threshold, max_speakers):
-            """Update system settings"""
-            return diarization_system.update_settings(threshold, max_speakers)
-        def get_formatted_conversation():
-            """Get the current conversation"""
-            return diarization_system.get_formatted_conversation()
-        def get_status():
-            """Get system status"""
-            return diarization_system.get_status_info()
         # Connect event handlers
         init_btn.click(
             on_initialize,
@@ -719,12 +759,12 @@ def create_interface():
         )
         start_btn.click(
-            on_start_stream,
             outputs=[status_output, start_btn, stop_btn]
         )
         stop_btn.click(
-            on_stop_stream,
             outputs=[status_output, start_btn, stop_btn]
         )
@@ -739,7 +779,7 @@ def create_interface():
             outputs=[status_output]
         )
-        # Auto-refresh every 2 seconds when streaming
         refresh_timer = gr.Timer(2.0)
         refresh_timer.tick(
             refresh_display,
@@ -749,10 +789,24 @@ def create_interface():
     return app
-if __name__ == "__main__":
     app = create_interface()
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True
     )

 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
+from RealtimeSTT import AudioToTextRecorder
+from fastrtc import Stream, AsyncStreamHandler, ReplyOnPause
 import json
 import io
 import wave
+import asyncio
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
+FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
+FINAL_BEAM_SIZE = 5
+REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
+REALTIME_BEAM_SIZE = 5
+TRANSCRIPTION_LANGUAGE = "en"
 SILERO_SENSITIVITY = 0.4
 WEBRTC_SENSITIVITY = 3
 MIN_LENGTH_OF_RECORDING = 0.7
         }
 class RealtimeSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
+        self.recorder = None
         self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
+        self.current_conversation = ""
     def initialize_models(self):
         """Initialize the speaker encoder model"""
             print(f"Model initialization error: {e}")
             return False
+    def live_text_detected(self, text):
+        """Callback for real-time transcription updates"""
+        text = text.strip()
+        if text:
+            sentence_delimiters = '.?!。'
+            prob_sentence_end = (
+                len(self.last_realtime_text) > 0
+                and text[-1] in sentence_delimiters
+                and self.last_realtime_text[-1] in sentence_delimiters
             )
+            self.last_realtime_text = text
+            if prob_sentence_end and FAST_SENTENCE_END:
+                self.recorder.stop()
+            elif prob_sentence_end:
+                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
+            else:
+                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
+    def process_final_text(self, text):
+        """Process final transcribed text with speaker embedding"""
+        text = text.strip()
+        if text:
             try:
+                bytes_data = self.recorder.last_transcription_bytes
+                self.sentence_queue.put((text, bytes_data))
+                self.pending_sentences.append(text)
             except Exception as e:
+                print(f"Error processing final text: {e}")
     def process_sentence_queue(self):
         """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
+                text, bytes_data = self.sentence_queue.get(timeout=1)
                 # Convert audio data to int16
+                audio_int16 = np.frombuffer(bytes_data, dtype=np.int16)
                 # Extract speaker embedding
                 speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
                 # Remove from pending
                 if text in self.pending_sentences:
                     self.pending_sentences.remove(text)
+                # Update conversation display
+                self.current_conversation = self.get_formatted_conversation()
             except queue.Empty:
                 continue
             except Exception as e:
                 print(f"Error processing sentence: {e}")
+    def start_recording(self):
+        """Start the recording and transcription process"""
+        if self.encoder is None:
+            return "Please initialize models first!"
+        try:
+            # Setup recorder configuration for WebRTC input
+            recorder_config = {
+                'spinner': False,
+                'use_microphone': False,  # We'll feed audio manually
+                'model': FINAL_TRANSCRIPTION_MODEL,
+                'language': TRANSCRIPTION_LANGUAGE,
+                'silero_sensitivity': SILERO_SENSITIVITY,
+                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
+                'post_speech_silence_duration': SILENCE_THRESHS[1],
+                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
+                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
+                'on_realtime_transcription_update': self.live_text_detected,
+                'beam_size': FINAL_BEAM_SIZE,
+                'beam_size_realtime': REALTIME_BEAM_SIZE,
+                'buffer_size': BUFFER_SIZE,
+                'sample_rate': SAMPLE_RATE,
+            }
+            self.recorder = AudioToTextRecorder(**recorder_config)
+            # Start sentence processing thread
+            self.is_running = True
+            self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
+            self.sentence_thread.start()
+            # Start transcription thread
+            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
+            self.transcription_thread.start()
+            return "Recording started successfully! FastRTC audio input ready."
+        except Exception as e:
+            return f"Error starting recording: {e}"
+    def run_transcription(self):
+        """Run the transcription loop"""
+        try:
+            while self.is_running:
+                self.recorder.text(self.process_final_text)
+        except Exception as e:
+            print(f"Transcription error: {e}")
+    def stop_recording(self):
+        """Stop the recording process"""
         self.is_running = False
+        if self.recorder:
+            self.recorder.stop()
+        return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
         self.pending_sentences = []
         self.displayed_text = ""
         self.last_realtime_text = ""
+        self.current_conversation = "Conversation cleared!"
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
+                    speaker_name = "Unknown"
                 else:
                     speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
         except Exception as e:
             return f"Error getting status: {e}"
+    def process_audio(self, audio_data):
+        """Process audio data from FastRTC"""
+        if not self.is_running or not self.recorder:
+            return
+        try:
+            # Extract audio data from FastRTC format (sample_rate, numpy_array)
+            sample_rate, audio_array = audio_data
+            # Convert to int16 format
+            if audio_array.dtype != np.int16:
+                audio_array = (audio_array * 32767).astype(np.int16)
+            # Convert to bytes and feed to recorder
+            audio_bytes = audio_array.tobytes()
+            self.recorder.feed_audio(audio_bytes)
+        except Exception as e:
+            print(f"Error processing FastRTC audio: {e}")
+# FastRTC Audio Handler
+class DiarizationHandler(AsyncStreamHandler):
+    def __init__(self, diarization_system):
+        super().__init__()
+        self.diarization_system = diarization_system
+    async def emit(self):
+        """Not used in this implementation"""
+        return None
+    async def receive(self, data):
+        """Receive audio data from FastRTC and process it"""
+        if self.diarization_system.is_running:
+            self.diarization_system.process_audio(data)
 # Global instance
 diarization_system = RealtimeSpeakerDiarization()
+def initialize_system():
+    """Initialize the diarization system"""
+    success = diarization_system.initialize_models()
+    if success:
+        return "✅ System initialized successfully! Models loaded."
+    else:
+        return "❌ Failed to initialize system. Please check the logs."
+def start_recording():
+    """Start recording and transcription"""
+    return diarization_system.start_recording()
+def stop_recording():
+    """Stop recording and transcription"""
+    return diarization_system.stop_recording()
+def clear_conversation():
+    """Clear the conversation"""
+    return diarization_system.clear_conversation()
+def update_settings(threshold, max_speakers):
+    """Update system settings"""
+    return diarization_system.update_settings(threshold, max_speakers)
+def get_conversation():
+    """Get the current conversation"""
+    return diarization_system.get_formatted_conversation()
+def get_status():
+    """Get system status"""
+    return diarization_system.get_status_info()
+# Setup FastRTC stream handler
+def setup_fastrtc_handler():
+    """Set up FastRTC audio stream handler"""
+    handler = DiarizationHandler(diarization_system)
+    stream = Stream(handler=handler, modality="audio", mode="receive")
+    return stream
+# Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as app:
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding.")
         with gr.Row():
             with gr.Column(scale=2):
+                # FastRTC Audio Component
+                fastrtc_html = gr.HTML("""
+                <div class="fastrtc-container" style="margin-bottom: 20px;">
+                    <h3>🎙️ FastRTC Audio Input</h3>
+                    <p>Click the button below to start the audio stream:</p>
+                    <button id="start-fastrtc" style="background: #3498db; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer;">
+                        Start FastRTC Audio
+                    </button>
+                    <div id="fastrtc-status" style="margin-top: 10px; font-style: italic;">Not connected</div>
+                    <script>
+                        document.getElementById('start-fastrtc').addEventListener('click', function() {
+                            document.getElementById('fastrtc-status').textContent = 'Connecting...';
+                            // FastRTC will be initialized here by the middleware
+                        });
+                    </script>
+                </div>
+                """)
                 # Main conversation display
                 conversation_output = gr.HTML(
+                    value="<i>Click 'Initialize System' to start...</i>",
                     label="Live Conversation"
                 )
                 # Control buttons
                 with gr.Row():
                     init_btn = gr.Button("🔧 Initialize System", variant="secondary")
+                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
+                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
                     clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
                 # Status display
                 gr.Markdown("## 📝 Instructions")
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
+                2. Click **Start Recording** to begin processing
+                3. Click **Start FastRTC Audio** to connect your microphone
+                4. Allow microphone access when prompted
+                5. Speak into your microphone
+                6. Watch real-time transcription with speaker labels
+                7. Adjust settings as needed
                 """)
                 # Speaker color legend
                     color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+                # FastRTC Integration Notice
+                gr.Markdown("""
+                ## ℹ️ About FastRTC
+                This app uses FastRTC for low-latency audio streaming.
+                For optimal performance, use a modern browser and allow microphone access when prompted.
+                """)
         # Auto-refresh conversation and status
         def refresh_display():
+            return diarization_system.get_formatted_conversation(), diarization_system.get_status_info()
         # Event handlers
         def on_initialize():
                     result,
                     gr.update(interactive=True),   # start_btn
                     gr.update(interactive=True),   # clear_btn
+                    get_conversation(),
                     get_status()
                 )
             else:
                     result,
                     gr.update(interactive=False),  # start_btn
                     gr.update(interactive=False),  # clear_btn
+                    get_conversation(),
                     get_status()
                 )
+        def on_start():
+            result = start_recording()
             return (
                 result,
                 gr.update(interactive=False),  # start_btn
                 gr.update(interactive=True),   # stop_btn
             )
+        def on_stop():
+            result = stop_recording()
             return (
                 result,
                 gr.update(interactive=True),   # start_btn
                 gr.update(interactive=False),  # stop_btn
             )
         # Connect event handlers
         init_btn.click(
             on_initialize,
         )
         start_btn.click(
+            on_start,
             outputs=[status_output, start_btn, stop_btn]
         )
         stop_btn.click(
+            on_stop,
             outputs=[status_output, start_btn, stop_btn]
         )
             outputs=[status_output]
         )
+        # Auto-refresh every 2 seconds when recording
         refresh_timer = gr.Timer(2.0)
         refresh_timer.tick(
             refresh_display,
     return app
+async def main():
+    # Setup FastRTC stream
+    stream = setup_fastrtc_handler()
+    # Create Gradio app
     app = create_interface()
+    # Mount FastRTC stream to the Gradio app
+    stream.mount(app)
+    # Launch the app
     app.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=True
     )
+if __name__ == "__main__":
+    # Run the async application
+    asyncio.run(main())