import gradio as gr
import torch
import os
import time
import copy
from pathlib import Path
from typing import Optional, Tuple
import spaces
from vibevoice.modular.modeling_vibevoice_streaming_inference import (
VibeVoiceStreamingForConditionalGenerationInference,
)
from vibevoice.processor.vibevoice_streaming_processor import (
VibeVoiceStreamingProcessor,
)
class VoiceMapper:
"""Maps speaker names to voice file paths"""
def __init__(self):
self.setup_voice_presets()
        # Register a simplified alias for each preset file name
new_dict = {}
for name, path in self.voice_presets.items():
if "_" in name:
name = name.split("_")[0]
if "-" in name:
name = name.split("-")[-1]
new_dict[name] = path
self.voice_presets.update(new_dict)
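        # Example: a hypothetical preset file "en-Alice_woman.pt" is kept under its
        # full stem "en-Alice_woman" and also aliased to "Alice" ("_" split keeps
        # "en-Alice", then "-" split keeps "Alice").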
def setup_voice_presets(self):
"""Setup voice presets by scanning the voices directory."""
voices_dir = os.path.join(os.path.dirname(__file__), "demo/voices/streaming_model")
# Check if voices directory exists
if not os.path.exists(voices_dir):
print(f"Warning: Voices directory not found at {voices_dir}")
self.voice_presets = {}
self.available_voices = {}
return
        # Scan the voices directory for preset (.pt) files
        self.voice_presets = {}
        pt_files = [
            f
            for f in os.listdir(voices_dir)
            if f.lower().endswith(".pt") and os.path.isfile(os.path.join(voices_dir, f))
        ]
# Create dictionary with filename (without extension) as key
for pt_file in pt_files:
# Remove .pt extension to get the name
name = os.path.splitext(pt_file)[0]
# Create full path
full_path = os.path.join(voices_dir, pt_file)
self.voice_presets[name] = full_path
# Sort the voice presets alphabetically by name for better UI
self.voice_presets = dict(sorted(self.voice_presets.items()))
# Filter out voices that don't exist (this is now redundant but kept for safety)
self.available_voices = {
name: path for name, path in self.voice_presets.items() if os.path.exists(path)
}
print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
print(f"Available voices: {', '.join(self.available_voices.keys())}")
def get_voice_path(self, speaker_name: str) -> str:
"""Get voice file path for a given speaker name"""
# First try exact match
if speaker_name in self.voice_presets:
return self.voice_presets[speaker_name]
# Try partial matching (case insensitive)
speaker_lower = speaker_name.lower()
for preset_name, path in self.voice_presets.items():
if preset_name.lower() in speaker_lower or speaker_lower in preset_name.lower():
return path
        # Fall back to the first available voice if no match is found
        if not self.voice_presets:
            raise FileNotFoundError(
                f"No voice preset matches '{speaker_name}' and no presets are available."
            )
        default_voice = next(iter(self.voice_presets.values()))
        print(
            f"Warning: No voice preset found for '{speaker_name}', using default voice: {default_voice}"
        )
        return default_voice
# Load model and processor directly
print("Loading VibeVoice-Realtime model...")
MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
# Load processor (CPU operation)
PROCESSOR = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
# Load the model on CPU; generate_speech moves it to CUDA once @spaces.GPU attaches a GPU
MODEL = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
MODEL_PATH,
torch_dtype=torch.float16,
device_map="cpu",
attn_implementation="sdpa",
)
MODEL.eval()
MODEL.set_ddpm_inference_steps(num_steps=5)
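# Fewer DDPM denoising steps make generation faster, generally at some cost in
# audio fidelity; 5 steps is the speed/quality trade-off chosen for this Space.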
# Initialize voice mapper
VOICE_MAPPER = VoiceMapper()
print("Model loaded successfully!")
@spaces.GPU(duration=60) # Request GPU for 60 seconds
def generate_speech(
text: str,
speaker_name: str,
cfg_scale: float = 1.5,
progress=gr.Progress(),
) -> Tuple[Optional[str], str]:
"""
Generate speech from text using VibeVoice-Realtime with ZeroGPU
Args:
text: Input text to convert to speech
speaker_name: Name of the speaker voice to use
cfg_scale: Classifier-Free Guidance scale (higher = more faithful to text)
progress: Gradio progress tracker
Returns:
Tuple of (audio_path, status_message)
"""
if not text or not text.strip():
return None, "❌ Error: Please enter some text to convert to speech."
try:
progress(0, desc="Loading voice preset...")
        # Normalize curly quotes and apostrophes to plain ASCII equivalents
        full_script = (
            text.strip()
            .replace("\u2019", "'")  # right single quotation mark -> apostrophe
            .replace("\u201c", '"')  # left double quotation mark -> straight quote
            .replace("\u201d", '"')  # right double quotation mark -> straight quote
        )
# Get voice sample
voice_sample = VOICE_MAPPER.get_voice_path(speaker_name)
# Load voice sample to GPU
all_prefilled_outputs = torch.load(
voice_sample, map_location="cpu", weights_only=False
)
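        # weights_only=False lets torch.load unpickle arbitrary Python objects,
        # which is acceptable only because these preset files ship with the Space.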
progress(0.2, desc="Preparing inputs...")
# Prepare inputs
inputs = PROCESSOR.process_input_with_cached_prompt(
text=full_script,
cached_prompt=all_prefilled_outputs,
padding=True,
return_tensors="pt",
return_attention_mask=True,
)
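        # The cached prompt holds the voice's precomputed prefill state, so only
        # the new text has to be processed for this request.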
        # Move the model and input tensors to the GPU allocated by @spaces.GPU
        MODEL.to("cuda")
        for k, v in inputs.items():
            if torch.is_tensor(v):
                inputs[k] = v.to("cuda")
progress(0.4, desc="Generating speech on GPU...")
# Generate audio
start_time = time.time()
        with torch.autocast(device_type="cuda", dtype=torch.float16):  # mixed-precision inference on GPU
outputs = MODEL.generate(
**inputs,
max_new_tokens=None,
cfg_scale=cfg_scale,
tokenizer=PROCESSOR.tokenizer,
generation_config={"do_sample": False},
verbose=False,
all_prefilled_outputs=copy.deepcopy(all_prefilled_outputs)
if all_prefilled_outputs is not None
else None,
)
generation_time = time.time() - start_time
progress(0.8, desc="Saving audio...")
# Calculate metrics
if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
sample_rate = 24000
audio_samples = (
outputs.speech_outputs[0].shape[-1]
if len(outputs.speech_outputs[0].shape) > 0
else len(outputs.speech_outputs[0])
)
audio_duration = audio_samples / sample_rate
rtf = generation_time / audio_duration if audio_duration > 0 else float("inf")
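            # An RTF below 1.0 means audio is generated faster than real time.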
# Save output
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"generated_{int(time.time())}.wav")
PROCESSOR.save_audio(
outputs.speech_outputs[0].cpu(), # Move to CPU for saving
output_path=output_path,
)
progress(1.0, desc="Complete!")
# Create status message
status = f"""✅ **Generation Complete!**
📊 **Metrics:**
- Audio Duration: {audio_duration:.2f}s
- Generation Time: {generation_time:.2f}s
- Real-Time Factor: {rtf:.2f}x
- Speaker: {speaker_name}
- CFG Scale: {cfg_scale}
- Device: ZeroGPU (CUDA)
"""
# Move model back to CPU to free GPU memory
MODEL.to("cpu")
return output_path, status
else:
MODEL.to("cpu")
return None, "❌ Error: No audio output generated."
except Exception as e:
import traceback
error_msg = f"❌ Error during generation:\n{str(e)}\n\n{traceback.format_exc()}"
print(error_msg)
# Clean up GPU memory on error
try:
MODEL.to("cpu")
        except Exception:
pass
return None, error_msg
# Create Gradio interface
with gr.Blocks(
    fill_height=True,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
    ),
) as demo:
gr.Markdown(
"""
# 🎙️ VibeVoice-Realtime Text-to-Speech
Convert text to natural-sounding speech using Microsoft's VibeVoice-Realtime model.
**🚀 Powered by ZeroGPU** - Efficient GPU allocation for fast inference!
<div style="text-align: center; margin-top: 10px;">
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="text-decoration: none; color: #4F46E5; font-weight: 600;">
Built with anycoder ✨
</a>
</div>
"""
)
with gr.Row():
with gr.Column(scale=2):
# Input section
text_input = gr.Textbox(
label="Text to Convert",
placeholder="Enter the text you want to convert to speech...",
lines=8,
max_lines=20,
)
with gr.Row():
speaker_dropdown = gr.Dropdown(
choices=list(VOICE_MAPPER.available_voices.keys()),
value=list(VOICE_MAPPER.available_voices.keys())[0]
if VOICE_MAPPER.available_voices
else None,
label="Speaker Voice",
info="Select the voice to use for speech generation",
)
cfg_slider = gr.Slider(
minimum=1.0,
maximum=3.0,
value=1.5,
step=0.1,
label="CFG Scale",
info="Higher values = more faithful to text (1.0-3.0)",
)
generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
with gr.Column(scale=1):
# Output section
audio_output = gr.Audio(
label="Generated Speech",
type="filepath",
interactive=False,
)
status_output = gr.Markdown(
"""
**Status:** Ready to generate speech
Enter text and click "Generate Speech" to start.
⚡ Using ZeroGPU for efficient processing
"""
)
# Example inputs
gr.Examples(
examples=[
[
"VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio.",
list(VOICE_MAPPER.available_voices.keys())[0]
if VOICE_MAPPER.available_voices
else "Wayne",
1.5,
],
[
"The quick brown fox jumps over the lazy dog. This is a test of the text-to-speech system.",
list(VOICE_MAPPER.available_voices.keys())[0]
if VOICE_MAPPER.available_voices
else "Wayne",
1.5,
],
],
inputs=[text_input, speaker_dropdown, cfg_slider],
label="Example Inputs",
)
# Event handlers
generate_btn.click(
fn=generate_speech,
inputs=[text_input, speaker_dropdown, cfg_slider],
outputs=[audio_output, status_output],
api_name="generate",
)
# Footer
gr.Markdown(
"""
---
### 📝 Notes:
- **Model**: Microsoft VibeVoice-Realtime-0.5B
- **Sample Rate**: 24kHz
- **Context Length**: 8K tokens
- **Generation Length**: ~10 minutes
- **Infrastructure**: ZeroGPU (Hugging Face Spaces)
### ⚠️ Important:
- The model is designed for English text only
- Very short inputs (< 3 words) may produce unstable results
- Code, formulas, and special symbols are not supported
- Please use responsibly and disclose AI-generated content
- GPU is allocated dynamically - generation may take a few seconds to start
"""
)
# Launch the app (the theme is passed to gr.Blocks above, where it belongs)
if __name__ == "__main__":
    demo.launch(
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}
        ],
    )
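# Minimal client-side sketch for the API endpoint exposed via api_name="generate"
# (the Space URL and voice name below are placeholders, not part of this app):
#   from gradio_client import Client
#   client = Client("<owner>/<space-name>")
#   audio_path, status = client.predict(
#       "Hello from VibeVoice!", "Alice", 1.5, api_name="/generate"
#   )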