import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces
# Initialize the model and tokenizer
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
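# bfloat16 halves memory versus float32, and low_cpu_mem_usage keeps peak
# RAM down while the checkpoint loads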
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True
)
print("Model loaded successfully!")
@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message
        history: List of previous conversation messages
    """
    # Build messages from history; copy the list so we don't mutate
    # Gradio's chat state in place
    messages = list(history) if history else []
    # Add the current user message
    messages.append({"role": "user", "content": message})
    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Tokenize and move inputs to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Generation config - using dict format as in the official docs
    generation_config = dict(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None
    )
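    # top_k=None disables top-k filtering, so sampling is shaped by
    # temperature and top_p alone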
    # Generate - passing a GenerationConfig exactly as in the docs
    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config)
    )
    # Trim the prompt tokens from the output - exactly as in the official docs
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # Decode the newly generated tokens, dropping chat-template special tokens
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Generation above runs to completion first; yield the finished response
    # character by character to simulate streaming in the UI
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response
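# A true token-streaming variant (a sketch, not used here) could run
# generation on a background thread with transformers' TextIteratorStreamer:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(
#       tokenizer, skip_prompt=True, skip_special_tokens=True
#   )
#   Thread(
#       target=model.generate,
#       kwargs=dict(
#           **model_inputs,
#           streamer=streamer,
#           generation_config=GenerationConfig(**generation_config),
#       ),
#   ).start()
#   partial = ""
#   for new_text in streamer:
#       partial += new_text
#       yield partial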
# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot
        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a compact 1.5B-parameter reasoning-focused chat model.
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="header-link">Built with anycoder</a>
        """
    )
    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
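        # allow_tags keeps the model's <think>...</think> reasoning visible
        # instead of being stripped from the rendered chat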
        chatbot=gr.Chatbot(allow_tags=["think"]),
    )
    gr.Markdown(
        """
        ### About VibeThinker
        VibeThinker is a 1.5B-parameter conversational model designed for engaging, thoughtful conversations.
        This demo samples at temperature 0.6 with top_p 0.95 to balance creativity and coherence.
        **Powered by ZeroGPU** for on-demand GPU allocation.
        """
    )
if __name__ == "__main__":
    demo.launch()