from llama_cpp import Llama
import whisper
from TTS.api import TTS
import numpy as np
import gradio as gr
from gradio_unifiedaudio import UnifiedAudio
from pathlib import Path
import torch
from scipy.io import wavfile
from collections import deque
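
# Load the models up front: Whisper ("base") for speech-to-text, a quantized
# Qwen2 0.5B Instruct GGUF for response generation, and Coqui TTS
# (Tacotron2-DDC, LJSpeech) for speech synthesis.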
whisper_model = whisper.load_model("base")

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False,
)

tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)

dir_ = Path(__file__).parent
instream = None
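
# Scan the recorded samples for a sustained stretch of low energy: returns
# True once `pause_duration` seconds of consecutive samples all fall below
# `energy_threshold`.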
def detect_pause(instream, energy_threshold=800, pause_duration=2.0, sample_rate=16000):
    pause_samples = int(pause_duration * sample_rate)
    energy = np.abs(instream[1])
    window = deque(maxlen=pause_samples)
    for e in energy:
        window.append(e < energy_threshold)
        if len(window) == pause_samples and all(window):
            return True
    return False
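
# Streaming callback: append each incoming microphone chunk to the
# accumulated buffer and check whether the speaker has paused.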
def add_to_stream(audio, instream, pause_detected):
    if instream is None:
        ret = audio
    else:
        ret = (audio[0], np.concatenate((instream[1], audio[1])))
    # Check the full accumulated buffer; the previous state (instream)
    # is None on the first chunk and would crash detect_pause.
    if detect_pause(ret):
        pause_detected = True
        stop_recording(ret)
    return audio, ret, pause_detected
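
# Save the accumulated audio to disk, transcribe it, generate a reply with
# the LLM, and return the synthesized reply as a playable (non-streaming)
# audio component.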
def stop_recording(audio):
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")
    # Whisper transcripts carry leading whitespace, so strip before comparing;
    # return early so no reply is generated after a shutdown command.
    if text.strip().lower() in ["exit", "quit", "stop"]:
        print("Voice Assistant is shutting down.")
        return UnifiedAudio(value=None, streaming=True)
    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)
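
# Once playback of the assistant's reply ends, clear the buffer and return
# the component to streaming (recording) mode.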
def stop_playing():
    pause_detected = False
    return UnifiedAudio(value=None, streaming=True), None, pause_detected
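
# Helper that re-transcribes the saved recording; `audio_data` is
# currently unused.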
def transcribe_audio(audio_data):
    return whisper_model.transcribe("user_output.wav", language='en')['text']
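
# Run the transcript through the local LLM and return the completion text.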
def generate_response(prompt):
    response = llm(prompt=prompt)
    return response['choices'][0]['text'].strip()
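
# Synthesize the reply to a wav file and return its path for playback.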
def speak_text(text):
    tts.tts_to_file(text=text.strip(), file_path="bot_output.wav")
    return "bot_output.wav"
with gr.Blocks() as demo:
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()
    pause_detected = gr.State(False)

    mic.stop_recording(stop_recording, stream, mic)
    mic.end(stop_playing, None, [mic, stream, pause_detected])
    mic.stream(add_to_stream, [mic, stream, pause_detected], [mic, stream, pause_detected])
    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()