| | import gradio as gr |
| | from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration |
| | from PIL import Image |
| |
|
| | |
# Both the model weights and its processor come from the same Hugging Face
# checkpoint; name it once so the two loads cannot drift apart.
CHECKPOINT = "google/pix2struct-screen2words-large"

# Loaded once at import time and reused by every request.
model = Pix2StructForConditionalGeneration.from_pretrained(CHECKPOINT)
processor = Pix2StructProcessor.from_pretrained(CHECKPOINT)
| |
|
| | |
def describe_ui(image: "Image.Image | None") -> str:
    """Generate a short text description of a UI screenshot.

    Args:
        image: The screenshot as a PIL image, or ``None`` when the image
            input was submitted empty.

    Returns:
        The decoded model output with special tokens stripped, or a short
        prompt asking for an image when ``image`` is ``None``.
    """
    # An empty image input arrives as None; return a readable hint instead of
    # letting the processor raise on a missing image.
    if image is None:
        return "Please upload an image to describe."

    inputs = processor(images=image, return_tensors="pt")
    outputs = model.generate(**inputs)
    # Only one image was submitted, so the first generated sequence is the answer.
    return processor.decode(outputs[0], skip_special_tokens=True)
| |
|
| | |
# Build the web UI: one PIL image in, one text description out.
demo = gr.Interface(
    fn=describe_ui,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="UI Screen Describer (Pix2Struct)",
    description="Upload a screenshot or UI image and get an automatic description powered by Google’s Pix2Struct model.",
)

# Start the Gradio server as soon as the script runs.
demo.launch()
| |
|