import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces
# Initialize the model and tokenizer
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
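# bfloat16 halves memory versus float32, and low_cpu_mem_usage keeps peak
# RAM down while the checkpoint loads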
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True
)
print("Model loaded successfully!")
@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message
        history: List of previous conversation messages
    """
    # Build messages from history; copy the list so we don't mutate
    # Gradio's chat state in place
    messages = list(history) if history else []
    # Add the current user message
    messages.append({"role": "user", "content": message})
    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Tokenize and move inputs to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Generation config - using dict format as in the official docs
    generation_config = dict(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None
    )
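    # top_k=None disables top-k filtering, so sampling is shaped by
    # temperature and top_p alone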
    # Generate - passing a GenerationConfig exactly as in the docs
    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config)
    )
    # Trim the prompt tokens from the output - exactly as in the official docs
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # Decode the newly generated tokens, dropping chat-template special tokens
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Generation above runs to completion first; yield the finished response
    # character by character to simulate streaming in the UI
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response
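# A true token-streaming variant (a sketch, not used here) could run
# generation on a background thread with transformers' TextIteratorStreamer:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(
#       tokenizer, skip_prompt=True, skip_special_tokens=True
#   )
#   Thread(
#       target=model.generate,
#       kwargs=dict(
#           **model_inputs,
#           streamer=streamer,
#           generation_config=GenerationConfig(**generation_config),
#       ),
#   ).start()
#   partial = ""
#   for new_text in streamer:
#       partial += new_text
#       yield partial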
# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot
        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a compact 1.5B-parameter reasoning-focused chat model.
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="header-link">Built with anycoder</a>
        """
    )
    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
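        # allow_tags keeps the model's <think>...</think> reasoning visible
        # instead of being stripped from the rendered chat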
        chatbot=gr.Chatbot(allow_tags=["think"]),
    )
    gr.Markdown(
        """
        ### About VibeThinker
        VibeThinker is a 1.5B-parameter conversational model designed for engaging, thoughtful conversations.
        This demo samples at temperature 0.6 with top_p 0.95 to balance creativity and coherence.
        **Powered by ZeroGPU** for on-demand GPU allocation.
        """
    )
if __name__ == "__main__":
    demo.launch()