import gradio as gr
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# --- Configuration ---
MODEL_PATH = "zhangbaoxin/qwen3-vl-2b-package_unsloth_finetune"

# --- Model and Processor Loading ---
print("Loading model and processor... This can take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype="auto",       # let transformers pick a compatible dtype
    device_map="auto",  # place the model on CPU (or GPU when available)
)
print("Model and processor loaded successfully.")
# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model.
    """
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Prepare the messages payload for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Preparation for inference: tokenize the chat template and the image
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Inference: generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # Keep only the new tokens by trimming the input IDs from the generated IDs
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the trimmed IDs to text
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # batch_decode returns a list; return the first element
        return output_text[0]
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Duplicate this Space for a faster, dedicated experience.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            # The default prompt is Chinese; in English it reads:
            # "Question: Does this image contain a package?
            #  Options: A. There is a package. B. No package (carpet, fire hydrant, etc.)"
            text_prompt = gr.Textbox(
                label="Prompt",
                value="Question:\n此图片有包裹吗?\nOptions:\nA. 有包裹.\nB. 没有包裹(地毯,消防栓等).\nPlease select the correct answer from the options above.",
            )
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )
if __name__ == "__main__":
    demo.launch()
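# Assumed contents of this Space's requirements.txt (versions are a best guess):
#   torch
#   transformers>=4.57   # Qwen3VLForConditionalGeneration requires a recent release
#   accelerate           # required for device_map="auto"
#   gradio
#   pillow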