Spaces:

AndroidGuy
/

Real_Time_diarization

Sleeping

App Files Community

Saiyaswanth007 committed on May 25

Commit

9a6051e

1 Parent(s): 3466e71

Check point 2

Browse files

Files changed (1) hide show

app.py +458 -920

app.py CHANGED Viewed

@@ -10,15 +10,17 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
-from fastrtc import Stream, AsyncStreamHandler, ReplyOnPause, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
 import json
-import io
-import wave
 import asyncio
 import uvicorn
-import socket
 from queue import Queue
-import time
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
@@ -32,35 +34,31 @@ MIN_LENGTH_OF_RECORDING = 0.7
 PRE_RECORDING_BUFFER_DURATION = 0.35
 # Speaker change detection parameters
-DEFAULT_CHANGE_THRESHOLD = 0.7
 EMBEDDING_HISTORY_SIZE = 5
-MIN_SEGMENT_DURATION = 1.0
 DEFAULT_MAX_SPEAKERS = 4
-ABSOLUTE_MAX_SPEAKERS = 10
 # Global variables
-FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
-BUFFER_SIZE = 512
 CHANNELS = 1
-# Speaker colors
 SPEAKER_COLORS = [
-    "#FFFF00",  # Yellow
-    "#FF0000",  # Red
-    "#00FF00",  # Green
-    "#00FFFF",  # Cyan
-    "#FF00FF",  # Magenta
-    "#0000FF",  # Blue
-    "#FF8000",  # Orange
-    "#00FF80",  # Spring Green
-    "#8000FF",  # Purple
-    "#FFFFFF",  # White
 ]
 SPEAKER_COLOR_NAMES = [
-    "Yellow", "Red", "Green", "Cyan", "Magenta",
-    "Blue", "Orange", "Spring Green", "Purple", "White"
 ]
@@ -74,24 +72,11 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
-    def _download_model(self):
-        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
-        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
-        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
-        if not os.path.exists(model_path):
-            print(f"Downloading ECAPA-TDNN model to {model_path}...")
-            urllib.request.urlretrieve(model_url, model_path)
-        return model_path
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
-            model_path = self._download_model()
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
@@ -99,9 +84,10 @@ class SpeechBrainEncoder:
             )
             self.model_loaded = True
             return True
         except Exception as e:
-            print(f"Error loading ECAPA-TDNN model: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
@@ -111,10 +97,15 @@ class SpeechBrainEncoder:
         try:
             if isinstance(audio, np.ndarray):
-                waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
             else:
                 waveform = audio.unsqueeze(0)
             if sr != 16000:
                 waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
@@ -123,7 +114,7 @@ class SpeechBrainEncoder:
             return embedding.squeeze().cpu().numpy()
         except Exception as e:
-            print(f"Error extracting embedding: {e}")
             return np.zeros(self.embedding_dim)
@@ -131,41 +122,60 @@ class AudioProcessor:
     """Processes audio data to extract speaker embeddings"""
     def __init__(self, encoder):
         self.encoder = encoder
-    def extract_embedding(self, audio_int16):
-        try:
-            float_audio = audio_int16.astype(np.float32) / 32768.0
-            if np.abs(float_audio).max() > 1.0:
-                float_audio = float_audio / np.abs(float_audio).max()
-            embedding = self.encoder.embed_utterance(float_audio)
             return embedding
         except Exception as e:
-            print(f"Embedding extraction error: {e}")
-            return np.zeros(self.encoder.embedding_dim)
 class SpeakerChangeDetector:
-    """Speaker change detector that supports a configurable number of speakers"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
         self.max_speakers = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
         self.current_speaker = 0
-        self.previous_embeddings = []
-        self.last_change_time = time.time()
-        self.mean_embeddings = [None] * self.max_speakers
         self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
-        self.last_similarity = 0.0
         self.active_speakers = set([0])
     def set_max_speakers(self, max_speakers):
         """Update the maximum number of speakers"""
         new_max = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
         if new_max < self.max_speakers:
             for speaker_id in list(self.active_speakers):
                 if speaker_id >= new_max:
                     self.active_speakers.discard(speaker_id)
@@ -173,85 +183,85 @@ class SpeakerChangeDetector:
             if self.current_speaker >= new_max:
                 self.current_speaker = 0
         if new_max > self.max_speakers:
-            self.mean_embeddings.extend([None] * (new_max - self.max_speakers))
             self.speaker_embeddings.extend([[] for _ in range(new_max - self.max_speakers)])
         else:
-            self.mean_embeddings = self.mean_embeddings[:new_max]
             self.speaker_embeddings = self.speaker_embeddings[:new_max]
         self.max_speakers = new_max
     def set_change_threshold(self, threshold):
         """Update the threshold for detecting speaker changes"""
-        self.change_threshold = max(0.1, min(threshold, 0.99))
     def add_embedding(self, embedding, timestamp=None):
-        """Add a new embedding and check if there's a speaker change"""
         current_time = timestamp or time.time()
-        if not self.previous_embeddings:
-            self.previous_embeddings.append(embedding)
-            self.speaker_embeddings[self.current_speaker].append(embedding)
-            if self.mean_embeddings[self.current_speaker] is None:
-                self.mean_embeddings[self.current_speaker] = embedding.copy()
-            return self.current_speaker, 1.0
-        current_mean = self.mean_embeddings[self.current_speaker]
-        if current_mean is not None:
-            similarity = 1.0 - cosine(embedding, current_mean)
         else:
-            similarity = 1.0 - cosine(embedding, self.previous_embeddings[-1])
         self.last_similarity = similarity
         time_since_last_change = current_time - self.last_change_time
-        is_speaker_change = False
-        if time_since_last_change >= MIN_SEGMENT_DURATION:
-            if similarity < self.change_threshold:
-                best_speaker = self.current_speaker
-                best_similarity = similarity
-                for speaker_id in range(self.max_speakers):
-                    if speaker_id == self.current_speaker:
-                        continue
-                    speaker_mean = self.mean_embeddings[speaker_id]
-                    if speaker_mean is not None:
-                        speaker_similarity = 1.0 - cosine(embedding, speaker_mean)
-                        if speaker_similarity > best_similarity:
-                            best_similarity = speaker_similarity
-                            best_speaker = speaker_id
-                if best_speaker != self.current_speaker:
-                    is_speaker_change = True
-                    self.current_speaker = best_speaker
-                elif len(self.active_speakers) < self.max_speakers:
-                    for new_id in range(self.max_speakers):
-                        if new_id not in self.active_speakers:
-                            is_speaker_change = True
-                            self.current_speaker = new_id
-                            self.active_speakers.add(new_id)
-                            break
-        if is_speaker_change:
-            self.last_change_time = current_time
-        self.previous_embeddings.append(embedding)
-        if len(self.previous_embeddings) > EMBEDDING_HISTORY_SIZE:
-            self.previous_embeddings.pop(0)
         self.speaker_embeddings[self.current_speaker].append(embedding)
-        self.active_speakers.add(self.current_speaker)
-        if len(self.speaker_embeddings[self.current_speaker]) > 30:
-            self.speaker_embeddings[self.current_speaker] = self.speaker_embeddings[self.current_speaker][-30:]
         if self.speaker_embeddings[self.current_speaker]:
-            self.mean_embeddings[self.current_speaker] = np.mean(
                 self.speaker_embeddings[self.current_speaker], axis=0
             )
@@ -264,7 +274,7 @@ class SpeakerChangeDetector:
         return "#FFFFFF"
     def get_status_info(self):
-        """Return status information about the speaker change detector"""
         speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
         return {
@@ -273,7 +283,8 @@ class SpeakerChangeDetector:
             "active_speakers": len(self.active_speakers),
             "max_speakers": self.max_speakers,
             "last_similarity": self.last_similarity,
-            "threshold": self.change_threshold
         }
@@ -283,173 +294,121 @@ class RealtimeSpeakerDiarization:
         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
-        self.sentence_queue = queue.Queue(maxsize=100)  # Add maxsize to prevent unlimited growth
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
-        self.current_conversation = ""
-        self.audio_buffer = []
-        # Add locks for thread safety
-        self._state_lock = threading.RLock()  # Reentrant lock for shared state
-        self._audio_lock = threading.Lock()   # Lock for audio processing
     def initialize_models(self):
         """Initialize the speaker encoder model"""
         try:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"Using device: {device_str}")
             self.encoder = SpeechBrainEncoder(device=device_str)
-            # Try to load model with timeout
-            import threading
-            load_success = [False]
-            def load_model_thread():
-                try:
-                    success = self.encoder.load_model()
-                    load_success[0] = success
-                except Exception as e:
-                    print(f"Error in model loading thread: {e}")
-            # Start loading in a thread with timeout
-            load_thread = threading.Thread(target=load_model_thread)
-            load_thread.daemon = True
-            load_thread.start()
-            load_thread.join(timeout=60)  # 60 second timeout for model loading
-            if load_success[0]:
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
-                print("Failed to load ECAPA-TDNN model or timeout occurred")
-                return self._initialize_fallback()
-        except Exception as e:
-            print(f"Model initialization error: {e}")
-            import traceback
-            traceback.print_exc()
-            return self._initialize_fallback()
-    def _initialize_fallback(self):
-        """Initialize fallback mode when model loading fails"""
-        try:
-            print("Initializing fallback mode with simple speaker detection...")
-            # Create a simple embedding dimension
-            embedding_dim = 64
-            # Create a dummy encoder that produces random embeddings
-            class DummyEncoder:
-                def __init__(self):
-                    self.embedding_dim = embedding_dim
-                    self.model_loaded = True
-                def embed_utterance(self, audio, sr=16000):
-                    # Simple energy-based pseudo-embedding
-                    if isinstance(audio, np.ndarray):
-                        # Create a simple feature vector (not a real embedding)
-                        energy = np.mean(np.abs(audio))
-                        # Create a pseudo-random but consistent embedding based on audio energy
-                        np.random.seed(int(energy * 1000))
-                        return np.random.rand(embedding_dim)
-                    return np.random.rand(embedding_dim)
-            # Set up system with fallback components
-            self.encoder = DummyEncoder()
-            self.audio_processor = AudioProcessor(self.encoder)
-            self.speaker_detector = SpeakerChangeDetector(
-                embedding_dim=embedding_dim,
-                change_threshold=self.change_threshold,
-                max_speakers=2  # Limit speakers in fallback mode
-            )
-            print("Fallback mode initialized - limited functionality!")
-            return True
         except Exception as e:
-            print(f"Even fallback initialization failed: {e}")
             return False
     def live_text_detected(self, text):
         """Callback for real-time transcription updates"""
-        text = text.strip()
-        if text:
-            sentence_delimiters = '.?!。'
-            prob_sentence_end = (
-                len(self.last_realtime_text) > 0
-                and text[-1] in sentence_delimiters
-                and self.last_realtime_text[-1] in sentence_delimiters
-            )
-            self.last_realtime_text = text
-            if prob_sentence_end and FAST_SENTENCE_END:
-                self.recorder.stop()
-            elif prob_sentence_end:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
-            else:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
         text = text.strip()
         if text:
             try:
-                bytes_data = self.recorder.last_transcription_bytes
-                self.sentence_queue.put((text, bytes_data), timeout=1.0)  # Added timeout
-                with self._state_lock:
-                    self.pending_sentences.append(text)
             except Exception as e:
-                print(f"Error processing final text: {e}")
     def process_sentence_queue(self):
         """Process sentences in the queue for speaker detection"""
         while self.is_running:
             try:
-                text, bytes_data = self.sentence_queue.get(timeout=1)
-                # Convert audio data to int16
-                audio_int16 = np.frombuffer(bytes_data, dtype=np.int16)
-                # Extract speaker embedding
-                speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-                with self._state_lock:
-                    # Store sentence and embedding
-                    self.full_sentences.append((text, speaker_embedding))
-                    # Fill in missing speaker assignments
-                    while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                        self.sentence_speakers.append(0)
-                    # Detect speaker changes
-                    speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-                    self.sentence_speakers.append(speaker_id)
-                    # Remove from pending
-                    if text in self.pending_sentences:
-                        self.pending_sentences.remove(text)
-                    # Update conversation display
-                    self.current_conversation = self.get_formatted_conversation()
             except queue.Empty:
                 continue
             except Exception as e:
-                print(f"Error processing sentence: {e}")
-                import traceback
-                traceback.print_exc()
     def start_recording(self):
         """Start the recording and transcription process"""
@@ -457,10 +416,10 @@ class RealtimeSpeakerDiarization:
             return "Please initialize models first!"
         try:
-            # Setup recorder configuration for manual audio input
             recorder_config = {
                 'spinner': False,
-                'use_microphone': False,  # We'll feed audio manually
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
@@ -470,45 +429,29 @@ class RealtimeSpeakerDiarization:
                 'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
                 'min_gap_between_recordings': 0,
                 'enable_realtime_transcription': True,
-                'realtime_processing_pause': 0,
                 'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
                 'on_realtime_transcription_update': self.live_text_detected,
                 'beam_size': FINAL_BEAM_SIZE,
                 'beam_size_realtime': REALTIME_BEAM_SIZE,
-                'buffer_size': BUFFER_SIZE,
                 'sample_rate': SAMPLE_RATE,
-                'external_audio': True,  # Signal that we'll provide audio
             }
-            # Make sure we're not running already
-            if hasattr(self, 'is_running') and self.is_running:
-                self.stop_recording()
-                # Short pause to ensure cleanup completes
-                time.sleep(0.5)
             self.recorder = AudioToTextRecorder(**recorder_config)
-            # Reset state
-            with self._state_lock:
-                self.pending_sentences = []
-                self.last_realtime_text = ""
-            # Start sentence processing thread
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.sentence_thread.start()
-            # Start transcription thread
             self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
             self.transcription_thread.start()
-            return "Recording started successfully! FastRTC audio input ready."
         except Exception as e:
-            self.is_running = False
-            import traceback
-            traceback.print_exc()
-            return f"Error starting recording: {str(e)}"
     def run_transcription(self):
         """Run the transcription loop"""
@@ -516,63 +459,21 @@ class RealtimeSpeakerDiarization:
             while self.is_running:
                 self.recorder.text(self.process_final_text)
         except Exception as e:
-            print(f"Transcription error: {e}")
     def stop_recording(self):
         """Stop the recording process"""
         self.is_running = False
         if self.recorder:
             self.recorder.stop()
-        # Wait for threads to finish
-        self._cleanup_resources()
         return "Recording stopped!"
-    def _cleanup_resources(self):
-        """Clean up resources and threads"""
-        try:
-            # Wait for threads to stop gracefully
-            if hasattr(self, 'sentence_thread') and self.sentence_thread is not None:
-                if self.sentence_thread.is_alive():
-                    self.sentence_thread.join(timeout=3.0)
-            if hasattr(self, 'transcription_thread') and self.transcription_thread is not None:
-                if self.transcription_thread.is_alive():
-                    self.transcription_thread.join(timeout=3.0)
-            # Clean up memory
-            with self._state_lock:
-                # Limit history size to prevent memory leaks
-                if len(self.full_sentences) > 1000:
-                    self.full_sentences = self.full_sentences[-1000:]
-                if len(self.sentence_speakers) > 1000:
-                    self.sentence_speakers = self.sentence_speakers[-1000:]
-            # Clear audio buffer
-            with self._audio_lock:
-                self.audio_buffer = []
-            # Clear queue
-            while not self.sentence_queue.empty():
-                try:
-                    self.sentence_queue.get_nowait()
-                except:
-                    pass
-        except Exception as e:
-            print(f"Error during resource cleanup: {e}")
-            import traceback
-            traceback.print_exc()
     def clear_conversation(self):
         """Clear all conversation data"""
-        self.full_sentences = []
-        self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""
-        self.current_conversation = "Conversation cleared!"
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -595,36 +496,8 @@ class RealtimeSpeakerDiarization:
         return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
     def get_formatted_conversation(self):
-        """Get the formatted conversation with speaker colors"""
-        try:
-            sentences_with_style = []
-            # Process completed sentences
-            for i, sentence in enumerate(self.full_sentences):
-                sentence_text, _ = sentence
-                if i >= len(self.sentence_speakers):
-                    color = "#FFFFFF"
-                    speaker_name = "Unknown"
-                else:
-                    speaker_id = self.sentence_speakers[i]
-                    color = self.speaker_detector.get_color_for_speaker(speaker_id)
-                    speaker_name = f"Speaker {speaker_id + 1}"
-                sentences_with_style.append(
-                    f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
-            # Add pending sentences
-            for pending_sentence in self.pending_sentences:
-                sentences_with_style.append(
-                    f'<span style="color:#60FFFF;"><b>Processing:</b> {pending_sentence}</span>')
-            if sentences_with_style:
-                return "<br><br>".join(sentences_with_style)
-            else:
-                return "Waiting for speech input..."
-        except Exception as e:
-            return f"Error formatting conversation: {e}"
     def get_status_info(self):
         """Get current status information"""
@@ -640,808 +513,473 @@ class RealtimeSpeakerDiarization:
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
                 f"**Total Sentences:** {len(self.full_sentences)}",
                 "",
-                "**Speaker Segment Counts:**"
             ]
             for i in range(status['max_speakers']):
                 color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
-                status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
             return "\n".join(status_lines)
         except Exception as e:
             return f"Error getting status: {e}"
-    def feed_audio_data(self, audio_data):
-        """Feed audio data to the recorder"""
-        if not self.is_running or not self.recorder:
-            return
-        try:
-            # Ensure audio is in the correct format (16-bit PCM)
-            if isinstance(audio_data, np.ndarray):
-                if audio_data.dtype != np.int16:
-                    # Convert float to int16
-                    if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
-                        audio_data = (audio_data * 32767).astype(np.int16)
-                    else:
-                        audio_data = audio_data.astype(np.int16)
-                # Convert to bytes
-                audio_bytes = audio_data.tobytes()
-            else:
-                audio_bytes = audio_data
-            # Use the recorder's internal buffer mechanism
-            if hasattr(self.recorder, 'feed_audio') and callable(self.recorder.feed_audio):
-                self.recorder.feed_audio(audio_bytes)
-            else:
-                # Fallback: Direct access to the underlying buffer if the method doesn't exist
-                self.audio_buffer.append(audio_bytes)
-                # Process buffered audio when enough is accumulated
-                if len(self.audio_buffer) > 5:  # Process in small batches
-                    combined = b''.join(self.audio_buffer)
-                    if hasattr(self.recorder, '_process_audio'):
-                        self.recorder._process_audio(combined)
-                    self.audio_buffer = []
-        except Exception as e:
-            print(f"Error feeding audio data: {str(e)}")
-            import traceback
-            traceback.print_exc()
     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Process audio chunk from FastRTC input"""
-        if not self.is_running or self.recorder is None:
             return
         try:
-            with self._audio_lock:
-                # Use the normalized audio function
-                audio_int16 = self._normalize_audio_format(audio_data, target_dtype=np.int16, target_sample_rate=SAMPLE_RATE)
-                # Check if we got valid audio
-                if audio_int16.size == 0:
-                    print("Warning: Empty audio chunk received")
-                    return
-                # Resample if needed
-                if sample_rate != SAMPLE_RATE:
-                    audio_int16 = self._resample_audio(audio_int16, sample_rate, SAMPLE_RATE)
-                # Convert to bytes for feeding to recorder
-                audio_bytes = audio_int16.tobytes()
-                # Feed to recorder
-                self.feed_audio_data(audio_bytes)
-        except Exception as e:
-            print(f"Error processing audio chunk: {str(e)}")
-            import traceback
-            traceback.print_exc()
-    def _resample_audio(self, audio, orig_sr, target_sr):
-        """Resample audio to target sample rate"""
-        try:
-            import scipy.signal
-            # Get the resampling ratio
-            ratio = target_sr / orig_sr
-            # Calculate the new length
-            new_length = int(len(audio[0]) * ratio)
-            # Resample the audio
-            resampled = scipy.signal.resample(audio[0], new_length)
-            # Return in the same shape format
-            return np.expand_dims(resampled, 0)
-        except Exception as e:
-            print(f"Error resampling audio: {e}")
-            return audio
-    def _normalize_audio_format(self, audio_data, target_dtype=np.int16, target_sample_rate=SAMPLE_RATE):
-        """Normalize audio data to consistent format
-        Args:
-            audio_data: Input audio as numpy array or bytes
-            target_dtype: Target data type (np.int16 or np.float32)
-            target_sample_rate: Target sample rate
-        Returns:
-            Normalized audio as numpy array in requested format
-        """
-        try:
-            # Convert bytes to numpy if needed
-            if isinstance(audio_data, bytes):
-                audio_array = np.frombuffer(audio_data, dtype=np.int16)
-            elif isinstance(audio_data, (list, tuple)):
-                audio_array = np.array(audio_data)
-            else:
-                audio_array = audio_data
-            # Convert data type as needed
-            if target_dtype == np.int16 and audio_array.dtype != np.int16:
-                if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-                    # Check if normalized to [-1, 1] range
-                    if np.max(np.abs(audio_array)) <= 1.0:
-                        audio_array = (audio_array * 32767).astype(np.int16)
-                    else:
-                        audio_array = audio_array.astype(np.int16)
-                else:
-                    audio_array = audio_array.astype(np.int16)
-            elif target_dtype == np.float32 and audio_array.dtype != np.float32:
-                if audio_array.dtype == np.int16:
-                    audio_array = audio_array.astype(np.float32) / 32768.0
-                else:
-                    audio_array = audio_array.astype(np.float32)
-            # Ensure mono audio
-            if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
-                audio_array = np.mean(audio_array, axis=1)
-            # Reshape if needed
-            if len(audio_array.shape) == 1:
-                if target_dtype == np.int16:
-                    audio_array = np.expand_dims(audio_array, 0)
-            return audio_array
         except Exception as e:
-            print(f"Error normalizing audio format: {e}")
-            import traceback
-            traceback.print_exc()
-            # Return empty array of correct type as fallback
-            return np.array([], dtype=target_dtype)
-# FastRTC Audio Handler for Real-time Diarization
 class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
-        self.audio_queue = asyncio.Queue(maxsize=100)  # Use asyncio queue
-        self.is_processing = False
-        self.sample_rate = 16000  # Default sample rate
-        self.processing_task = None
     def copy(self):
         """Return a fresh handler for each new stream connection"""
         return DiarizationHandler(self.diarization_system)
     async def emit(self):
-        """Not used in this implementation - we only receive audio"""
         return None
     async def receive(self, frame):
-        """Receive audio data from FastRTC and process it"""
         try:
             if not self.diarization_system.is_running:
                 return
-            # Extract audio data from frame
-            if hasattr(frame, 'data') and frame.data is not None:
-                audio_data = frame.data
-            elif hasattr(frame, 'audio') and frame.audio is not None:
-                audio_data = frame.audio
             else:
-                audio_data = frame
-            # Get sample rate from frame if available
-            sample_rate = getattr(frame, 'sample_rate', self.sample_rate)
-            # Add to queue - non-blocking with timeout
-            try:
-                # Use put_nowait with try/except to avoid blocking
-                await asyncio.wait_for(
-                    self.audio_queue.put((audio_data, sample_rate)),
-                    timeout=0.1
-                )
-            except asyncio.TimeoutError:
-                # Queue is full, drop this chunk
-                print("Warning: Audio queue full, dropping frame")
-                return
         except Exception as e:
-            print(f"Error in FastRTC audio receive: {e}")
-            import traceback
-            traceback.print_exc()
-    async def _process_audio_loop(self):
-        """Background task to process audio from queue"""
-        while self.is_processing:
-            try:
-                # Get from queue with timeout to allow checking is_processing flag
-                try:
-                    audio_data, sample_rate = await asyncio.wait_for(
-                        self.audio_queue.get(),
-                        timeout=0.5
-                    )
-                except asyncio.TimeoutError:
-                    # No audio available, check if we should keep running
-                    continue
-                # Convert to numpy array if needed
-                if isinstance(audio_data, bytes):
-                    # Convert bytes to numpy array (assuming 16-bit PCM)
-                    audio_array = np.frombuffer(audio_data, dtype=np.int16)
-                    # Normalize to float32 range [-1, 1]
-                    audio_array = audio_array.astype(np.float32) / 32768.0
-                elif isinstance(audio_data, (list, tuple)):
-                    audio_array = np.array(audio_data, dtype=np.float32)
-                elif isinstance(audio_data, np.ndarray):
-                    audio_array = audio_array.astype(np.float32)
-                else:
-                    print(f"Unknown audio data type: {type(audio_data)}")
-                    continue
-                # Ensure mono audio
-                if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
-                    audio_array = np.mean(audio_array, axis=1)
-                # Ensure 1D array
-                if len(audio_array.shape) > 1:
-                    audio_array = audio_array.flatten()
-                # Process audio through thread pool to avoid blocking event loop
-                await self.process_audio_async(audio_array, sample_rate)
-                # Mark as done
-                self.audio_queue.task_done()
-            except Exception as e:
-                print(f"Error in audio processing loop: {e}")
-                import traceback
-                traceback.print_exc()
-                # Short sleep to avoid tight loop
-                await asyncio.sleep(0.1)
-    async def process_audio_async(self, audio_data, sample_rate=16000):
         """Process audio data asynchronously"""
         try:
-            # Run the audio processing in a thread pool to avoid blocking
             loop = asyncio.get_event_loop()
             await loop.run_in_executor(
                 None,
                 self.diarization_system.process_audio_chunk,
                 audio_data,
-                sample_rate
             )
         except Exception as e:
-            print(f"Error in async audio processing: {e}")
-    async def start_up(self) -> None:
-        """Initialize any resources when the stream starts"""
-        print("FastRTC stream started")
-        self.is_processing = True
-        # Start background processing task
-        self.processing_task = asyncio.create_task(self._process_audio_loop())
-    async def shutdown(self) -> None:
-        """Clean up any resources when the stream ends"""
-        print("FastRTC stream shutting down")
-        self.is_processing = False
-        # Wait for processing task to finish
-        if self.processing_task:
-            try:
-                # Cancel and wait for task
-                self.processing_task.cancel()
-                await asyncio.wait([self.processing_task], timeout=2.0)
-            except (asyncio.CancelledError, Exception) as e:
-                print(f"Error cancelling audio processing task: {e}")
-        # Clear queue
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-                self.audio_queue.task_done()
-            except:
-                pass
 # Global instances
-diarization_system = None  # Will be initialized when RealtimeSpeakerDiarization is available
 audio_handler = None
 def initialize_system():
     """Initialize the diarization system"""
-    global audio_handler, diarization_system
     try:
-        if diarization_system is None:
-            print("Error: RealtimeSpeakerDiarization not initialized")
-            return "❌ Diarization system not available. Please ensure RealtimeSpeakerDiarization is properly imported."
         success = diarization_system.initialize_models()
         if success:
             audio_handler = DiarizationHandler(diarization_system)
-            return "✅ System initialized successfully! Models loaded and FastRTC handler ready."
         else:
-            return "❌ Failed to initialize system. Please check the logs."
     except Exception as e:
-        print(f"Initialization error: {e}")
         return f"❌ Initialization error: {str(e)}"
 def start_recording():
     """Start recording and transcription"""
     try:
-        if diarization_system is None:
-            return "❌ System not initialized"
         result = diarization_system.start_recording()
-        return f"🎙️ {result} - FastRTC audio streaming is active."
     except Exception as e:
         return f"❌ Failed to start recording: {str(e)}"
 def stop_recording():
     """Stop recording and transcription"""
     try:
-        if diarization_system is None:
-            return "❌ System not initialized"
         result = diarization_system.stop_recording()
         return f"⏹️ {result}"
     except Exception as e:
         return f"❌ Failed to stop recording: {str(e)}"
 def clear_conversation():
     """Clear the conversation"""
     try:
-        if diarization_system is None:
-            return "❌ System not initialized"
         result = diarization_system.clear_conversation()
         return f"🗑️ {result}"
     except Exception as e:
         return f"❌ Failed to clear conversation: {str(e)}"
 def update_settings(threshold, max_speakers):
     """Update system settings"""
     try:
-        if diarization_system is None:
-            return "❌ System not initialized"
         result = diarization_system.update_settings(threshold, max_speakers)
         return f"⚙️ {result}"
     except Exception as e:
         return f"❌ Failed to update settings: {str(e)}"
 def get_conversation():
     """Get the current conversation"""
     try:
-        if diarization_system is None:
-            return "<i>System not initialized</i>"
         return diarization_system.get_formatted_conversation()
     except Exception as e:
         return f"<i>Error getting conversation: {str(e)}</i>"
 def get_status():
     """Get system status"""
     try:
-        if diarization_system is None:
-            return "System not initialized"
         return diarization_system.get_status_info()
     except Exception as e:
         return f"Error getting status: {str(e)}"
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification using FastRTC for low-latency audio streaming.")
         with gr.Row():
             with gr.Column(scale=2):
-                # Main conversation display
                 conversation_output = gr.HTML(
-                    value="<div style='padding: 20px; background: #f5f5f5; border-radius: 10px;'><i>Click 'Initialize System' to start...</i></div>",
-                    label="Live Conversation",
-                    elem_id="conversation_display"
                 )
                 # Control buttons
                 with gr.Row():
                     init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
-                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", size="lg", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", size="lg", interactive=False)
                     clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg", interactive=False)
-                # FastRTC Stream Interface
-                with gr.Row():
-                    gr.HTML("""
-                    <div id="fastrtc-container" style="border: 2px solid #ddd; border-radius: 10px; padding: 20px; margin: 10px 0;">
-                        <h3>🎵 Audio Stream</h3>
-                        <p>FastRTC audio stream will appear here when recording starts.</p>
-                        <div id="stream-status" style="padding: 10px; background: #f8f9fa; border-radius: 5px; margin-top: 10px;">
-                            Status: Waiting for initialization...
-                        </div>
-                    </div>
-                    """)
                 # Status display
                 status_output = gr.Textbox(
                     label="System Status",
-                    value="System not initialized. Please click 'Initialize System' to begin.",
-                    lines=6,
-                    interactive=False,
-                    show_copy_button=True
                 )
             with gr.Column(scale=1):
-                # Settings panel
                 gr.Markdown("## ⚙️ Settings")
                 threshold_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=0.95,
                     step=0.05,
-                    value=0.5,  # DEFAULT_CHANGE_THRESHOLD
                     label="Speaker Change Sensitivity",
-                    info="Lower = more sensitive to speaker changes"
                 )
                 max_speakers_slider = gr.Slider(
                     minimum=2,
-                    maximum=10,  # ABSOLUTE_MAX_SPEAKERS
                     step=1,
-                    value=4,  # DEFAULT_MAX_SPEAKERS
-                    label="Maximum Number of Speakers"
                 )
-                update_settings_btn = gr.Button("Update Settings", variant="secondary")
-                # Audio settings
-                gr.Markdown("## 🔊 Audio Configuration")
-                with gr.Accordion("Advanced Audio Settings", open=False):
-                    gr.Markdown("""
-                    **Current Configuration:**
-                    - Sample Rate: 16kHz
-                    - Audio Format: 16-bit PCM → Float32 (via AudioProcessor)
-                    - Channels: Mono (stereo converted automatically)
-                    - Buffer Size: 1024 samples for real-time processing
-                    - Processing: Uses existing AudioProcessor.extract_embedding()
-                    """)
                 # Instructions
-                gr.Markdown("## 📝 How to Use")
                 gr.Markdown("""
-                1. **Initialize**: Click "Initialize System" to load AI models
-                2. **Start**: Click "Start Recording" to begin processing
-                3. **Connect**: The FastRTC stream will activate automatically
-                4. **Allow Access**: Grant microphone permissions when prompted
-                5. **Speak**: Talk naturally into your microphone
-                6. **Monitor**: Watch real-time transcription with speaker colors
                 """)
-                # Performance tips
-                with gr.Accordion("💡 Performance Tips", open=False):
-                    gr.Markdown("""
-                    - Use Chrome/Edge for best FastRTC performance
-                    - Ensure stable internet connection
-                    - Use headphones to prevent echo
-                    - Position microphone 6-12 inches away
-                    - Minimize background noise
-                    - Allow browser microphone access
-                    """)
-                # Speaker color legend
-                gr.Markdown("## 🎨 Speaker Colors")
-                speaker_colors = [
-                    ("#FF6B6B", "Red"),
-                    ("#4ECDC4", "Teal"),
-                    ("#45B7D1", "Blue"),
-                    ("#96CEB4", "Green"),
-                    ("#FFEAA7", "Yellow"),
-                    ("#DDA0DD", "Plum"),
-                    ("#98D8C8", "Mint"),
-                    ("#F7DC6F", "Gold")
-                ]
-                color_html = ""
-                for i, (color, name) in enumerate(speaker_colors[:4]):
-                    color_html += f'<div style="margin: 3px 0;"><span style="color:{color}; font-size: 16px; font-weight: bold;">●</span> Speaker {i+1} ({name})</div>'
-                gr.HTML(f"<div style='font-size: 14px;'>{color_html}</div>")
-        # Auto-refresh conversation and status
-        def refresh_display():
-            try:
-                conversation = get_conversation()
-                status = get_status()
-                return conversation, status
-            except Exception as e:
-                error_msg = f"Error refreshing display: {str(e)}"
-                return f"<i>{error_msg}</i>", error_msg
         # Event handlers
         def on_initialize():
-            try:
-                result = initialize_system()
-                success = "successfully" in result.lower()
-                conversation, status = refresh_display()
-                return (
-                    result,  # status_output
-                    gr.update(interactive=success),   # start_btn
-                    gr.update(interactive=success),   # clear_btn
-                    conversation,  # conversation_output
-                )
-            except Exception as e:
-                error_msg = f"❌ Initialization failed: {str(e)}"
-                return (
-                    error_msg,
-                    gr.update(interactive=False),
-                    gr.update(interactive=False),
-                    "<i>System not ready</i>",
-                )
         def on_start():
-            try:
-                result = start_recording()
-                return (
-                    result,  # status_output
-                    gr.update(interactive=False),  # start_btn
-                    gr.update(interactive=True),   # stop_btn
-                )
-            except Exception as e:
-                error_msg = f"❌ Failed to start: {str(e)}"
-                return (
-                    error_msg,
-                    gr.update(interactive=True),
-                    gr.update(interactive=False),
-                )
         def on_stop():
-            try:
-                result = stop_recording()
-                return (
-                    result,  # status_output
-                    gr.update(interactive=True),   # start_btn
-                    gr.update(interactive=False),  # stop_btn
-                )
-            except Exception as e:
-                error_msg = f"❌ Failed to stop: {str(e)}"
-                return (
-                    error_msg,
-                    gr.update(interactive=False),
-                    gr.update(interactive=True),
-                )
         def on_clear():
-            try:
-                result = clear_conversation()
-                conversation, status = refresh_display()
-                return result, conversation
-            except Exception as e:
-                error_msg = f"❌ Failed to clear: {str(e)}"
-                return error_msg, "<i>Error clearing conversation</i>"
         def on_update_settings(threshold, max_speakers):
-            try:
-                result = update_settings(threshold, max_speakers)
-                return result
-            except Exception as e:
-                return f"❌ Failed to update settings: {str(e)}"
-        # Connect event handlers
         init_btn.click(
-            on_initialize,
-            outputs=[status_output, start_btn, clear_btn, conversation_output]
         )
         start_btn.click(
-            on_start,
             outputs=[status_output, start_btn, stop_btn]
         )
         stop_btn.click(
-            on_stop,
             outputs=[status_output, start_btn, stop_btn]
         )
         clear_btn.click(
-            on_clear,
-            outputs=[status_output, conversation_output]
         )
-        update_settings_btn.click(
-            on_update_settings,
             inputs=[threshold_slider, max_speakers_slider],
             outputs=[status_output]
         )
-        # Auto-refresh every 2 seconds when active
-        refresh_timer = gr.Timer(2.0)
-        refresh_timer.tick(
-            refresh_display,
-            outputs=[conversation_output, status_output]
         )
     return interface
-# FastAPI setup for API endpoints
 def create_fastapi_app():
-    """Create FastAPI app with API endpoints"""
-    app = FastAPI(
-        title="Real-time Speaker Diarization",
-        description="Real-time speech recognition with speaker diarization using FastRTC",
-        version="1.0.0"
-    )
-    # API Routes
-    router = APIRouter()
-    @router.get("/health")
-    async def health_check():
-        """Health check endpoint"""
-        return {
-            "status": "healthy",
-            "timestamp": time.time(),
-            "system_initialized": diarization_system is not None and hasattr(diarization_system, 'encoder') and diarization_system.encoder is not None,
-            "recording_active": diarization_system.is_running if diarization_system and hasattr(diarization_system, 'is_running') else False
-        }
-    @router.get("/api/conversation")
     async def get_conversation_api():
-        """Get current conversation"""
         try:
             return {
-                "conversation": get_conversation(),
-                "status": get_status(),
-                "is_recording": diarization_system.is_running if diarization_system and hasattr(diarization_system, 'is_running') else False,
-                "timestamp": time.time()
             }
         except Exception as e:
-            return {"error": str(e), "timestamp": time.time()}
-    @router.post("/api/control/{action}")
-    async def control_recording(action: str):
-        """Control recording actions"""
         try:
-            if action == "start":
-                result = start_recording()
-            elif action == "stop":
-                result = stop_recording()
-            elif action == "clear":
-                result = clear_conversation()
-            elif action == "initialize":
-                result = initialize_system()
-            else:
-                return {"error": "Invalid action. Use: start, stop, clear, or initialize"}
-            return {
-                "result": result,
-                "is_recording": diarization_system.is_running if diarization_system and hasattr(diarization_system, 'is_running') else False,
-                "timestamp": time.time()
-            }
         except Exception as e:
-            return {"error": str(e), "timestamp": time.time()}
-    app.include_router(router)
     return app
-# Function to setup FastRTC stream
-def setup_fastrtc_stream(app):
-    """Setup FastRTC stream with proper configuration"""
-    try:
-        if audio_handler is None:
-            print("Warning: Audio handler not initialized. Initialize system first.")
-            return None
-        # Get HuggingFace token for TURN server (optional)
-        hf_token = os.environ.get("HF_TOKEN")
-        # Configure RTC settings
-        rtc_config = {
-            "iceServers": [
-                {"urls": "stun:stun.l.google.com:19302"},
-                {"urls": "stun:stun1.l.google.com:19302"}
-            ]
-        }
-        # Create FastRTC stream
-        stream = Stream(
-            handler=audio_handler,
-            rtc_configuration=rtc_config,
-            modality="audio",
-            mode="receive"  # We only receive audio, don't send
-        )
-        # Mount the stream
-        app.mount("/stream", stream)
-        print("✅ FastRTC stream configured successfully!")
-        return stream
-    except Exception as e:
-        print(f"⚠️ Warning: Failed to setup FastRTC stream: {e}")
-        print("Audio streaming may not work properly.")
-        return None
-# Main application setup
-def create_app(diarization_sys=None):
-    """Create the complete application"""
-    global diarization_system
-    # Set the diarization system
-    if diarization_sys is not None:
-        diarization_system = diarization_sys
-    # Create FastAPI app
-    fastapi_app = create_fastapi_app()
-    # Create Gradio interface
-    gradio_interface = create_interface()
-    # Mount Gradio on FastAPI
-    app = gr.mount_gradio_app(fastapi_app, gradio_interface, path="/")
-    # Setup FastRTC stream
-    if diarization_system is not None:
-        # Initialize the system if not already done
-        if not hasattr(diarization_system, 'encoder') or diarization_system.encoder is None:
-            diarization_system.initialize_models()
-        # Create audio handler if needed
-        global audio_handler
-        if audio_handler is None:
-            audio_handler = DiarizationHandler(diarization_system)
-        # Setup and mount the FastRTC stream
-        setup_fastrtc_stream(app)
-    return app, gradio_interface
-# Entry point for HuggingFace Spaces
-if __name__ == "__main__":
-    try:
-        # Import your diarization system here
-        # from your_module import RealtimeSpeakerDiarization
-        diarization_system = RealtimeSpeakerDiarization()
-        # Create the application
-        app, interface = create_app()
-        # Launch for HuggingFace Spaces
         interface.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
             share=True,
-            show_error=True,
-            quiet=False
         )
     except Exception as e:
-        print(f"Failed to launch application: {e}")
-        import traceback
-        traceback.print_exc()
-        # Fallback - launch just Gradio interface
-        try:
-            interface = create_interface()
-            interface.launch(
-                server_name="0.0.0.0",
-                server_port=int(os.environ.get("PORT", 7860)),
-                share=False
-            )
-        except Exception as fallback_error:
-            print(f"Fallback launch also failed: {fallback_error}")
-# Helper function to initialize with your diarization system
-def initialize_with_diarization_system(diarization_sys):
-    """Initialize the application with your diarization system"""
-    global diarization_system
-    diarization_system = diarization_sys
-    return create_app(diarization_sys)

 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI, APIRouter
+from fastrtc import Stream, AsyncStreamHandler
 import json
 import asyncio
 import uvicorn
 from queue import Queue
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
 FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
 PRE_RECORDING_BUFFER_DURATION = 0.35
 # Speaker change detection parameters
+DEFAULT_CHANGE_THRESHOLD = 0.65
 EMBEDDING_HISTORY_SIZE = 5
+MIN_SEGMENT_DURATION = 1.5
 DEFAULT_MAX_SPEAKERS = 4
+ABSOLUTE_MAX_SPEAKERS = 8
 # Global variables
 SAMPLE_RATE = 16000
+BUFFER_SIZE = 1024
 CHANNELS = 1
+# Speaker colors - more distinguishable colors
 SPEAKER_COLORS = [
+    "#FF6B6B",  # Red
+    "#4ECDC4",  # Teal
+    "#45B7D1",  # Blue
+    "#96CEB4",  # Green
+    "#FFEAA7",  # Yellow
+    "#DDA0DD",  # Plum
+    "#98D8C8",  # Mint
+    "#F7DC6F",  # Gold
 ]
 SPEAKER_COLOR_NAMES = [
+    "Red", "Teal", "Blue", "Green", "Yellow", "Plum", "Mint", "Gold"
 ]
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
             )
             self.model_loaded = True
+            logger.info("ECAPA-TDNN model loaded successfully!")
             return True
         except Exception as e:
+            logger.error(f"Error loading ECAPA-TDNN model: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
         try:
             if isinstance(audio, np.ndarray):
+                # Ensure audio is float32 and properly normalized
+                audio = audio.astype(np.float32)
+                if np.max(np.abs(audio)) > 1.0:
+                    audio = audio / np.max(np.abs(audio))
+                waveform = torch.tensor(audio).unsqueeze(0)
             else:
                 waveform = audio.unsqueeze(0)
+            # Resample if necessary
             if sr != 16000:
                 waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
             return embedding.squeeze().cpu().numpy()
         except Exception as e:
+            logger.error(f"Error extracting embedding: {e}")
             return np.zeros(self.embedding_dim)
     """Processes audio data to extract speaker embeddings"""
     def __init__(self, encoder):
         self.encoder = encoder
+        self.audio_buffer = []
+        self.min_audio_length = int(SAMPLE_RATE * 1.0)  # Minimum 1 second of audio
+    def add_audio_chunk(self, audio_chunk):
+        """Add audio chunk to buffer"""
+        self.audio_buffer.extend(audio_chunk)
+        # Keep buffer from getting too large
+        max_buffer_size = int(SAMPLE_RATE * 10)  # 10 seconds max
+        if len(self.audio_buffer) > max_buffer_size:
+            self.audio_buffer = self.audio_buffer[-max_buffer_size:]
+    def extract_embedding_from_buffer(self):
+        """Extract embedding from current audio buffer"""
+        if len(self.audio_buffer) < self.min_audio_length:
+            return None
+        try:
+            # Use the last portion of the buffer for embedding
+            audio_segment = np.array(self.audio_buffer[-self.min_audio_length:], dtype=np.float32)
+            # Normalize audio
+            if np.max(np.abs(audio_segment)) > 0:
+                audio_segment = audio_segment / np.max(np.abs(audio_segment))
+            else:
+                return None
+            embedding = self.encoder.embed_utterance(audio_segment)
             return embedding
         except Exception as e:
+            logger.error(f"Embedding extraction error: {e}")
+            return None
 class SpeakerChangeDetector:
+    """Improved speaker change detector"""
     def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
         self.embedding_dim = embedding_dim
         self.change_threshold = change_threshold
         self.max_speakers = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
         self.current_speaker = 0
         self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
+        self.speaker_centroids = [None] * self.max_speakers
+        self.last_change_time = time.time()
+        self.last_similarity = 1.0
         self.active_speakers = set([0])
+        self.segment_counter = 0
     def set_max_speakers(self, max_speakers):
         """Update the maximum number of speakers"""
         new_max = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
         if new_max < self.max_speakers:
+            # Remove speakers beyond the new limit
             for speaker_id in list(self.active_speakers):
                 if speaker_id >= new_max:
                     self.active_speakers.discard(speaker_id)
             if self.current_speaker >= new_max:
                 self.current_speaker = 0
+        # Resize arrays
         if new_max > self.max_speakers:
             self.speaker_embeddings.extend([[] for _ in range(new_max - self.max_speakers)])
+            self.speaker_centroids.extend([None] * (new_max - self.max_speakers))
         else:
             self.speaker_embeddings = self.speaker_embeddings[:new_max]
+            self.speaker_centroids = self.speaker_centroids[:new_max]
         self.max_speakers = new_max
     def set_change_threshold(self, threshold):
         """Update the threshold for detecting speaker changes"""
+        self.change_threshold = max(0.1, min(threshold, 0.95))
     def add_embedding(self, embedding, timestamp=None):
+        """Add a new embedding and detect speaker changes"""
         current_time = timestamp or time.time()
+        self.segment_counter += 1
+        # Initialize first speaker
+        if not self.speaker_embeddings[0]:
+            self.speaker_embeddings[0].append(embedding)
+            self.speaker_centroids[0] = embedding.copy()
+            self.active_speakers.add(0)
+            return 0, 1.0
+        # Calculate similarity with current speaker
+        current_centroid = self.speaker_centroids[self.current_speaker]
+        if current_centroid is not None:
+            similarity = 1.0 - cosine(embedding, current_centroid)
         else:
+            similarity = 0.5
         self.last_similarity = similarity
+        # Check for speaker change
         time_since_last_change = current_time - self.last_change_time
+        speaker_changed = False
+        if time_since_last_change >= MIN_SEGMENT_DURATION and similarity < self.change_threshold:
+            # Find best matching speaker
+            best_speaker = self.current_speaker
+            best_similarity = similarity
+            for speaker_id in self.active_speakers:
+                if speaker_id == self.current_speaker:
+                    continue
+                centroid = self.speaker_centroids[speaker_id]
+                if centroid is not None:
+                    speaker_similarity = 1.0 - cosine(embedding, centroid)
+                    if speaker_similarity > best_similarity and speaker_similarity > self.change_threshold:
+                        best_similarity = speaker_similarity
+                        best_speaker = speaker_id
+            # If no good match found and we can add a new speaker
+            if best_speaker == self.current_speaker and len(self.active_speakers) < self.max_speakers:
+                for new_id in range(self.max_speakers):
+                    if new_id not in self.active_speakers:
+                        best_speaker = new_id
+                        self.active_speakers.add(new_id)
+                        break
+            if best_speaker != self.current_speaker:
+                self.current_speaker = best_speaker
+                self.last_change_time = current_time
+                speaker_changed = True
+        # Update speaker embeddings and centroids
         self.speaker_embeddings[self.current_speaker].append(embedding)
+        # Keep only recent embeddings (sliding window)
+        max_embeddings = 20
+        if len(self.speaker_embeddings[self.current_speaker]) > max_embeddings:
+            self.speaker_embeddings[self.current_speaker] = self.speaker_embeddings[self.current_speaker][-max_embeddings:]
+        # Update centroid
         if self.speaker_embeddings[self.current_speaker]:
+            self.speaker_centroids[self.current_speaker] = np.mean(
                 self.speaker_embeddings[self.current_speaker], axis=0
             )
         return "#FFFFFF"
     def get_status_info(self):
+        """Return status information"""
         speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
         return {
             "active_speakers": len(self.active_speakers),
             "max_speakers": self.max_speakers,
             "last_similarity": self.last_similarity,
+            "threshold": self.change_threshold,
+            "segment_counter": self.segment_counter
         }
         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
+        self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
+        self.current_conversation = ""
         self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
+        self.last_transcription = ""
+        self.transcription_lock = threading.Lock()
     def initialize_models(self):
         """Load the speaker encoder and build the helpers that depend on it.

         Returns:
             True when the encoder loaded and the audio processor and speaker
             detector were created; False when loading failed or raised.
         """
         try:
             if torch.cuda.is_available():
                 target_device = "cuda"
             else:
                 target_device = "cpu"
             logger.info(f"Using device: {target_device}")
             self.encoder = SpeechBrainEncoder(device=target_device)
             # Bail out early so the success path below reads straight through.
             if not self.encoder.load_model():
                 logger.error("Failed to load models")
                 return False
             self.audio_processor = AudioProcessor(self.encoder)
             detector_options = {
                 "embedding_dim": self.encoder.embedding_dim,
                 "change_threshold": self.change_threshold,
                 "max_speakers": self.max_speakers,
             }
             self.speaker_detector = SpeakerChangeDetector(**detector_options)
             logger.info("Models initialized successfully!")
             return True
         except Exception as exc:
             logger.error(f"Model initialization error: {exc}")
             return False
     def live_text_detected(self, text):
         """Record the latest partial transcript, stripped of outer whitespace.

         Args:
             text: Partial transcription string handed to this callback.
         """
         cleaned = text.strip()
         # The same lock guards the readers of last_transcription.
         with self.transcription_lock:
             self.last_transcription = cleaned
     def process_final_text(self, text):
         """Queue a finalized sentence together with the audio it came from.

         Whitespace-only text is ignored. Otherwise a ``(text, audio)`` tuple is
         put on ``self.sentence_queue``; ``audio`` is None when the recorder
         exposes no audio (or empty audio) for the last transcription.

         Args:
             text: Final transcription string for one utterance.
         """
         text = text.strip()
         if not text:
             return
         try:
             # Get audio data for this transcription
             audio = getattr(self.recorder, 'last_transcription_bytes', None)
             # Use an explicit None/length test instead of truthiness: if the
             # recorder stores a NumPy array here, `if audio:` raises ValueError
             # and the sentence would be lost.
             # NOTE(review): RealtimeSTT appears to keep a float32 array in this
             # attribute rather than raw bytes - confirm against the installed version.
             if audio is not None and len(audio) == 0:
                 audio = None
             # With no audio, the consumer falls back to the current speaker.
             self.sentence_queue.put((text, audio))
         except Exception as e:
             logger.error(f"Error processing final text: {e}")
     def process_sentence_queue(self):
         """Consume queued sentences and tag each one with a speaker id.

         Runs until ``self.is_running`` becomes false. For every ``(text, audio)``
         item the audio (if any) is converted to float32 samples, embedded with
         the encoder, and passed to the speaker detector; the sentence is then
         appended to ``self.full_sentences`` and the display is refreshed.
         Errors are logged and the loop continues.
         """
         while self.is_running:
             try:
                 text, audio_bytes = self.sentence_queue.get(timeout=1)
                 current_speaker = self.speaker_detector.current_speaker
                 # Explicit None/length test: truth-testing a NumPy array raises.
                 if audio_bytes is not None and len(audio_bytes) > 0:
                     if isinstance(audio_bytes, np.ndarray):
                         if np.issubdtype(audio_bytes.dtype, np.integer):
                             # assumes integer PCM is 16-bit - TODO confirm
                             audio_float = audio_bytes.astype(np.float32) / 32768.0
                         else:
                             # Float samples are taken as already normalized.
                             audio_float = audio_bytes.astype(np.float32)
                     else:
                         # Raw bytes are interpreted as 16-bit PCM.
                         audio_int16 = np.frombuffer(audio_bytes, dtype=np.int16)
                         audio_float = audio_int16.astype(np.float32) / 32768.0
                     # Extract embedding
                     embedding = self.audio_processor.encoder.embed_utterance(audio_float)
                     if embedding is not None:
                         current_speaker, _ = self.speaker_detector.add_embedding(embedding)
                 # Store sentence with speaker
                 with self.transcription_lock:
                     self.full_sentences.append((text, current_speaker))
                     self.update_conversation_display()
             except queue.Empty:
                 # Nothing arrived within the timeout; re-check is_running.
                 continue
             except Exception as e:
                 logger.error(f"Error processing sentence: {e}")
+    def update_conversation_display(self):
+        """Update the conversation display"""
+        try:
+            sentences_with_style = []
+            for sentence_text, speaker_id in self.full_sentences:
+                color = self.speaker_detector.get_color_for_speaker(speaker_id)
+                speaker_name = f"Speaker {speaker_id + 1}"
+                sentences_with_style.append(
+                    f'<span style="color:{color}; font-weight: bold;">{speaker_name}:</span> '
+                    f'<span style="color:#333333;">{sentence_text}</span>'
+                )
+            # Add current transcription if available
+            if self.last_transcription:
+                current_color = self.speaker_detector.get_color_for_speaker(self.speaker_detector.current_speaker)
+                current_speaker = f"Speaker {self.speaker_detector.current_speaker + 1}"
+                sentences_with_style.append(
+                    f'<span style="color:{current_color}; font-weight: bold; opacity: 0.7;">{current_speaker}:</span> '
+                    f'<span style="color:#666666; font-style: italic;">{self.last_transcription}...</span>'
+                )
+            if sentences_with_style:
+                self.current_conversation = "<br><br>".join(sentences_with_style)
+            else:
+                self.current_conversation = "<i>Waiting for speech input...</i>"
+        except Exception as e:
+            logger.error(f"Error updating conversation display: {e}")
+            self.current_conversation = f"<i>Error: {str(e)}</i>"
     def start_recording(self):
         """Start the recording and transcription process"""
             return "Please initialize models first!"
         try:
+            # Setup recorder configuration
             recorder_config = {
                 'spinner': False,
+                'use_microphone': True,  # Changed to True for direct microphone input
                 'model': FINAL_TRANSCRIPTION_MODEL,
                 'language': TRANSCRIPTION_LANGUAGE,
                 'silero_sensitivity': SILERO_SENSITIVITY,
                 'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
                 'min_gap_between_recordings': 0,
                 'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0.1,
                 'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
                 'on_realtime_transcription_update': self.live_text_detected,
                 'beam_size': FINAL_BEAM_SIZE,
                 'beam_size_realtime': REALTIME_BEAM_SIZE,
                 'sample_rate': SAMPLE_RATE,
             }
             self.recorder = AudioToTextRecorder(**recorder_config)
+            # Start processing threads
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.sentence_thread.start()
             self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
             self.transcription_thread.start()
+            return "Recording started successfully!"
         except Exception as e:
+            logger.error(f"Error starting recording: {e}")
+            return f"Error starting recording: {e}"
     def run_transcription(self):
         """Run the transcription loop"""
             while self.is_running:
                 self.recorder.text(self.process_final_text)
         except Exception as e:
+            logger.error(f"Transcription error: {e}")
     def stop_recording(self):
         """Clear the running flag, stop the recorder if one exists, and report it."""
         self.is_running = False
         active_recorder = self.recorder
         if not active_recorder:
             return "Recording stopped!"
         active_recorder.stop()
         return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
+        with self.transcription_lock:
+            self.full_sentences = []
+            self.last_transcription = ""
+            self.current_conversation = "Conversation cleared!"
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
         return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
     def get_formatted_conversation(self):
         """Return the conversation string currently stored on the instance."""
         rendered = self.current_conversation
         return rendered
     def get_status_info(self):
         """Get current status information"""
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
                 f"**Total Sentences:** {len(self.full_sentences)}",
+                f"**Segments Processed:** {status['segment_counter']}",
                 "",
+                "**Speaker Activity:**"
             ]
             for i in range(status['max_speakers']):
                 color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
+                count = status['speaker_counts'][i]
+                active = "🟢" if count > 0 else "⚫"
+                status_lines.append(f"{active} Speaker {i+1} ({color_name}): {count} segments")
             return "\n".join(status_lines)
         except Exception as e:
             return f"Error getting status: {e}"
     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Feed one chunk of streamed audio into the speaker-detection pipeline.

         The chunk is converted to mono float32, peak-normalized when it exceeds
         [-1, 1], and appended to the audio processor's buffer. After roughly
         half a second of new audio (``SAMPLE_RATE // 2`` samples) an embedding
         is extracted from the buffer and passed to the speaker detector.

         Args:
             audio_data: Samples as a NumPy array or any array-like.
             sample_rate: Accepted for interface compatibility; not used here.

         Returns:
             None. Does nothing when the system is not running, the audio
             processor is missing, or the chunk is empty. Errors are logged.
         """
         if not self.is_running or self.audio_processor is None:
             return
         try:
             # Ensure audio is float32
             audio_data = np.asarray(audio_data, dtype=np.float32)
             # Ensure mono
             if audio_data.ndim > 1:
                 # Average over the channel axis, taken to be the shorter one so
                 # that channels-first (1, N) input is not collapsed to a single
                 # sample. NOTE(review): confirm the layout of incoming frames.
                 channel_axis = 0 if audio_data.shape[0] < audio_data.shape[1] else 1
                 audio_data = np.mean(audio_data, axis=channel_axis)
             # An empty chunk carries no audio, and np.max would raise on it.
             if audio_data.size == 0:
                 return
             # Normalize if needed (compute the peak once)
             peak = np.max(np.abs(audio_data))
             if peak > 1.0:
                 audio_data = audio_data / peak
             # Add to audio processor buffer for speaker detection
             self.audio_processor.add_audio_chunk(audio_data)
             # Count samples received since the last embedding. Testing
             # len(buffer) % interval == 0 only fires when the buffer length
             # lands exactly on a multiple, which depends on chunk size.
             # getattr default: the counter is created lazily on first use.
             pending = getattr(self, "_samples_since_embedding", 0) + audio_data.size
             if pending >= SAMPLE_RATE // 2:  # Every 0.5 seconds
                 pending = 0
                 embedding = self.audio_processor.extract_embedding_from_buffer()
                 if embedding is not None and self.speaker_detector is not None:
                     self.speaker_detector.add_embedding(embedding)
             self._samples_since_embedding = pending
         except Exception as e:
             logger.error(f"Error processing audio chunk: {e}")
+# FastRTC Audio Handler
 class DiarizationHandler(AsyncStreamHandler):
     """Receive-only stream handler that forwards audio to the diarization system.

     Incoming frames are converted to mono float32 samples, buffered, and passed
     on in fixed-size chunks of ``BUFFER_SIZE`` samples. Nothing is emitted back
     to the client.
     """

     def __init__(self, diarization_system):
         """Create a handler bound to ``diarization_system``."""
         # Request input at the rate the rest of the pipeline assumes.
         # NOTE(review): relies on AsyncStreamHandler accepting
         # `input_sample_rate`; confirm against the installed fastrtc version.
         super().__init__(input_sample_rate=SAMPLE_RATE)
         self.diarization_system = diarization_system
         self.audio_buffer = []
         self.buffer_size = BUFFER_SIZE

     def copy(self):
         """Return a fresh handler for each new stream connection"""
         return DiarizationHandler(self.diarization_system)

     async def emit(self):
         """Not used - we only receive audio"""
         return None

     @staticmethod
     def _frame_to_samples(frame):
         """Convert one incoming frame to a 1-D float32 sample array."""
         audio_data = getattr(frame, 'data', frame)
         # Frames may arrive as a (sample_rate, ndarray) pair; keep only the
         # samples. Without this, np.array() on the pair raises and the frame
         # is dropped.
         if (
             isinstance(audio_data, tuple)
             and len(audio_data) == 2
             and isinstance(audio_data[1], np.ndarray)
         ):
             audio_data = audio_data[1]
         if isinstance(audio_data, (bytes, bytearray)):
             # Raw bytes are interpreted as 16-bit PCM.
             return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
         samples = np.asarray(audio_data)
         if samples.dtype == np.int16:
             # Scale int16 arrays the same way as the bytes path.
             return samples.astype(np.float32).reshape(-1) / 32768.0
         return samples.astype(np.float32).reshape(-1)

     async def receive(self, frame):
         """Receive audio data from FastRTC and process it in fixed-size chunks."""
         try:
             if not self.diarization_system.is_running:
                 return
             # Buffer audio chunks
             self.audio_buffer.extend(self._frame_to_samples(frame))
             # Process in chunks
             while len(self.audio_buffer) >= self.buffer_size:
                 chunk = np.array(self.audio_buffer[:self.buffer_size], dtype=np.float32)
                 self.audio_buffer = self.audio_buffer[self.buffer_size:]
                 # Process asynchronously
                 await self.process_audio_async(chunk)
         except Exception as e:
             logger.error(f"Error in FastRTC receive: {e}")

     async def process_audio_async(self, audio_data):
         """Process audio data asynchronously"""
         try:
             # get_running_loop() is the supported call from inside a coroutine.
             loop = asyncio.get_running_loop()
             # Run in the default executor so the event loop is not blocked.
             await loop.run_in_executor(
                 None,
                 self.diarization_system.process_audio_chunk,
                 audio_data,
                 SAMPLE_RATE
             )
         except Exception as e:
             logger.error(f"Error in async audio processing: {e}")
 # Global instances
 # Module-level singleton used by the callback functions and API routes below.
+diarization_system = RealtimeSpeakerDiarization()
 # Set by initialize_system() after the models load; stays None until then.
 audio_handler = None
 def initialize_system():
     """Load the models and, on success, create the global audio handler.

     Returns:
         A status message prefixed with a success or failure emoji.
     """
     global audio_handler
     try:
         if not diarization_system.initialize_models():
             return "❌ Failed to initialize system. Check logs for details."
         audio_handler = DiarizationHandler(diarization_system)
         return "✅ System initialized successfully!"
     except Exception as exc:
         logger.error(f"Initialization error: {exc}")
         return f"❌ Initialization error: {str(exc)}"
 def start_recording():
     """Start recording and return the system's message with a status emoji."""
     try:
         return f"🎙️ {diarization_system.start_recording()}"
     except Exception as exc:
         return f"❌ Failed to start recording: {str(exc)}"
 def stop_recording():
     """Stop recording and return the system's message with a status emoji."""
     try:
         return f"⏹️ {diarization_system.stop_recording()}"
     except Exception as exc:
         return f"❌ Failed to stop recording: {str(exc)}"
 def clear_conversation():
     """Clear the stored conversation and return a status message."""
     try:
         return f"🗑️ {diarization_system.clear_conversation()}"
     except Exception as exc:
         return f"❌ Failed to clear conversation: {str(exc)}"
 def update_settings(threshold, max_speakers):
     """Forward new detection settings to the system and return a status message.

     Args:
         threshold: Speaker-change threshold passed through unchanged.
         max_speakers: Maximum speaker count passed through unchanged.
     """
     try:
         return f"⚙️ {diarization_system.update_settings(threshold, max_speakers)}"
     except Exception as exc:
         return f"❌ Failed to update settings: {str(exc)}"
 def get_conversation():
     """Return the formatted conversation, or an italic error message on failure."""
     try:
         conversation_markup = diarization_system.get_formatted_conversation()
     except Exception as exc:
         return f"<i>Error getting conversation: {str(exc)}</i>"
     return conversation_markup
 def get_status():
     """Return the system's status text, or an error message on failure."""
     try:
         status_text = diarization_system.get_status_info()
     except Exception as exc:
         return f"Error getting status: {str(exc)}"
     return status_text
 # Create Gradio interface
 def create_interface():
     """Build and return the Gradio Blocks UI for the diarization demo.

     The left column holds the conversation view, the control buttons, and the
     status box; the right column holds the settings sliders and instructions.
     Button clicks call the module-level wrapper functions, and a load event
     polls update_display every 0.5 seconds.
     """
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Soft()) as interface:
         gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
+        gr.Markdown("Live transcription with automatic speaker identification using FastRTC audio streaming.")
         with gr.Row():
             with gr.Column(scale=2):
+                # Conversation display
                 conversation_output = gr.HTML(
+                    value="<div style='padding: 20px; background: #f8f9fa; border-radius: 10px; min-height: 300px;'><i>Click 'Initialize System' to start...</i></div>",
+                    label="Live Conversation"
                 )
                 # Control buttons
                 # Start, Stop and Clear stay disabled until on_initialize succeeds.
                 with gr.Row():
                     init_btn = gr.Button("🔧 Initialize System", variant="secondary", size="lg")
+                    start_btn = gr.Button("🎙️ Start", variant="primary", size="lg", interactive=False)
+                    stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
                     clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="lg", interactive=False)
                 # Status display
                 status_output = gr.Textbox(
                     label="System Status",
+                    value="Ready to initialize...",
+                    lines=8,
+                    interactive=False
                 )
             with gr.Column(scale=1):
+                # Settings
                 gr.Markdown("## ⚙️ Settings")
                 threshold_slider = gr.Slider(
+                    minimum=0.3,
+                    maximum=0.9,
                     step=0.05,
+                    value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
+                    info="Lower = more sensitive"
                 )
                 max_speakers_slider = gr.Slider(
                     minimum=2,
+                    maximum=ABSOLUTE_MAX_SPEAKERS,
                     step=1,
+                    value=DEFAULT_MAX_SPEAKERS,
+                    label="Maximum Speakers"
                 )
+                update_btn = gr.Button("Update Settings", variant="secondary")
                 # Instructions
                 # NOTE(review): in the legend below several emoji do not match the
                 # color names next to them (e.g. a green circle labelled "Teal", a
                 # yellow circle labelled "Green"); verify against SPEAKER_COLORS.
                 gr.Markdown("""
+                ## 📋 Instructions
+                1. **Initialize** the system (loads AI models)
+                2. **Start** recording
+                3. **Speak** - system will transcribe and identify speakers
+                4. **Monitor** real-time results below
+                ## 🎨 Speaker Colors
+                - 🔴 Speaker 1 (Red)
+                - 🟢 Speaker 2 (Teal)
+                - 🔵 Speaker 3 (Blue)
+                - 🟡 Speaker 4 (Green)
+                - 🟣 Speaker 5 (Yellow)
+                - 🟤 Speaker 6 (Plum)
+                - 🟫 Speaker 7 (Mint)
+                - 🟨 Speaker 8 (Gold)
                 """)
         # Event handlers
         # Each handler returns the status message first, followed by one
         # gr.update per button listed in the matching `outputs=` below.
         def on_initialize():
+            result = initialize_system()
             # Success is detected from the emoji prefix of the returned message.
+            if "✅" in result:
+                return result, gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
+            else:
+                return result, gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
         def on_start():
+            result = start_recording()
             # Disable Start and enable Stop, whatever the result message says.
+            return result, gr.update(interactive=False), gr.update(interactive=True)
         def on_stop():
+            result = stop_recording()
+            return result, gr.update(interactive=True), gr.update(interactive=False)
         def on_clear():
+            result = clear_conversation()
+            return result
         def on_update_settings(threshold, max_speakers):
             # The slider value is cast to int before being passed on.
+            result = update_settings(threshold, int(max_speakers))
+            return result
+        def update_display():
+            """Continuously update the conversation display"""
+            conversation = get_conversation()
+            status = get_status()
+            return conversation, status
+        # Button event bindings
         init_btn.click(
+            fn=on_initialize,
+            inputs=[],
+            outputs=[status_output, start_btn, stop_btn, clear_btn]
         )
         start_btn.click(
+            fn=on_start,
+            inputs=[],
             outputs=[status_output, start_btn, stop_btn]
         )
         stop_btn.click(
+            fn=on_stop,
+            inputs=[],
             outputs=[status_output, start_btn, stop_btn]
         )
         clear_btn.click(
+            fn=on_clear,
+            inputs=[],
+            outputs=[status_output]
         )
+        update_btn.click(
+            fn=on_update_settings,
             inputs=[threshold_slider, max_speakers_slider],
             outputs=[status_output]
         )
+        # Auto-refresh conversation display every 500ms
         # NOTE(review): this poll also writes status_output, so messages returned
         # by the button handlers above are presumably replaced on the next tick.
+        interface.load(
+            fn=update_display,
+            inputs=[],
+            outputs=[conversation_output, status_output],
+            every=0.5
         )
     return interface
+# FastAPI integration for FastRTC
 def create_fastapi_app():
     """Create the FastAPI app with the REST routes and the FastRTC audio stream.

     Routes: ``GET /``, ``GET /status``, ``GET /conversation``,
     ``POST /initialize``, ``POST /start``, ``POST /stop``, ``POST /clear``.
     Each route catches exceptions and reports them in the JSON body.
     The FastRTC stream is mounted under the ``/stream`` prefix.

     Returns:
         The configured FastAPI application.
     """
     app = FastAPI(title="Real-time Speaker Diarization API")

     @app.get("/")
     async def root():
         return {"message": "Real-time Speaker Diarization API"}

     @app.get("/status")
     async def api_status():
         try:
             if diarization_system.speaker_detector:
                 status = diarization_system.speaker_detector.get_status_info()
                 return {
                     "initialized": True,
                     "running": diarization_system.is_running,
                     "current_speaker": status["current_speaker"],
                     "active_speakers": status["active_speakers"],
                     "max_speakers": status["max_speakers"],
                     "last_similarity": status["last_similarity"],
                     "threshold": status["threshold"]
                 }
             else:
                 return {"initialized": False, "running": False}
         except Exception as e:
             return {"error": str(e)}

     @app.get("/conversation")
     async def get_conversation_api():
         try:
             return {
                 "conversation": diarization_system.get_formatted_conversation(),
                 "sentences": len(diarization_system.full_sentences)
             }
         except Exception as e:
             return {"error": str(e)}

     @app.post("/initialize")
     async def initialize_api():
         try:
             result = initialize_system()
             # Success is inferred from the emoji prefix of the wrapper's message.
             return {"message": result, "success": "✅" in result}
         except Exception as e:
             return {"error": str(e), "success": False}

     @app.post("/start")
     async def start_api():
         try:
             result = start_recording()
             return {"message": result, "success": "🎙️" in result}
         except Exception as e:
             return {"error": str(e), "success": False}

     @app.post("/stop")
     async def stop_api():
         try:
             result = stop_recording()
             return {"message": result, "success": "⏹️" in result}
         except Exception as e:
             return {"error": str(e), "success": False}

     @app.post("/clear")
     async def clear_api():
         try:
             result = clear_conversation()
             return {"message": result, "success": True}
         except Exception as e:
             return {"error": str(e), "success": False}

     # FastRTC stream endpoint
     # The global handler only exists after initialize_system() has run, which
     # may be later than app creation (e.g. via POST /initialize). The handler
     # only needs a reference to diarization_system, so fall back to a fresh one
     # instead of silently skipping the stream routes.
     handler = audio_handler if audio_handler is not None else DiarizationHandler(diarization_system)
     # A Stream object is not a websocket endpoint; it registers its own routes
     # through mount(). The modality must be set to audio explicitly.
     # NOTE(review): confirm `mount(app, path=...)` against the installed fastrtc.
     stream = Stream(handler=handler, modality="audio", mode="send-receive")
     stream.mount(app, path="/stream")
     return app
+# Main execution
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='Real-time Speaker Diarization System')
+    parser.add_argument('--mode', choices=['gradio', 'api', 'both'], default='gradio',
+                       help='Run mode: gradio interface, API only, or both')
+    parser.add_argument('--port', type=int, default=7860,
+                       help='Port to run on (default: 7860)')
+    parser.add_argument('--host', type=str, default='0.0.0.0',
+                       help='Host to bind to (default: 0.0.0.0)')
+    args = parser.parse_args()
+    if args.mode == 'gradio':
+        # Run Gradio interface only
+        interface = create_interface()
+        interface.launch(
+            server_name=args.host,
+            server_port=args.port,
+            share=True,
+            show_error=True
+        )
+    elif args.mode == 'api':
+        # Run FastAPI only
+        app = create_fastapi_app()
+        uvicorn.run(app, host=args.host, port=args.port)
+    elif args.mode == 'both':
+        # Run both Gradio and FastAPI
+        import threading
+        # Start FastAPI in a separate thread
+        app = create_fastapi_app()
+        api_thread = threading.Thread(
+            target=lambda: uvicorn.run(app, host=args.host, port=args.port + 1),
+            daemon=True
+        )
+        api_thread.start()
+        # Start Gradio interface
+        interface = create_interface()
         interface.launch(
+            server_name=args.host,
+            server_port=args.port,
             share=True,
+            show_error=True
         )
+# Additional utility functions for Hugging Face Spaces
+def setup_for_huggingface():
+    """Setup function specifically for Hugging Face Spaces"""
+    # Auto-initialize when running on HF Spaces
+    try:
+        if os.environ.get('SPACE_ID'):  # Running on HF Spaces
+            logger.info("Running on Hugging Face Spaces - Auto-initializing...")
+            initialize_system()
+            logger.info("System ready for Hugging Face Spaces!")
     except Exception as e:
+        logger.error(f"HF Spaces setup error: {e}")
+# Call setup for HF Spaces
 # NOTE(review): these statements run at module level, after the
 # `if __name__ == "__main__":` block above; when the file is run as a script
 # they are only reached once that block has finished.
+setup_for_huggingface()
+# For Hugging Face Spaces, create and launch interface directly
 # Only creates the interface; nothing here calls launch().
+interface = create_interface()
+# Export the interface for HF Spaces
+demo = interface