import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
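# Load the Moondream 3 preview weights in bfloat16 directly onto the GPU.
# trust_remote_code is required because the model ships its own skill methods
# (query, caption, detect, point, segment) as custom code.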
moondream = AutoModelForCausalLM.from_pretrained(
"moondream/moondream3-preview",
trust_remote_code=True,
dtype=torch.bfloat16,
device_map={"": "cuda"},
)
moondream.compile()
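
# answer_questions: runs every prompt against every uploaded image and returns
# a Markdown transcript plus a table for the Dataframe output.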
@spaces.GPU
def answer_questions(image_tuples, prompt_text):
    # Collect the PIL images from the gallery tuples
    images = [img[0] for img in image_tuples if img[0] is not None]
    # Encode the first image once and reuse the encoding across skills
    encoded = moondream.encode_image(images[0])
    # Split the prompt text into individual questions
    questions = [q.strip() for q in prompt_text.split('?') if q.strip()]
    for q in questions:
        result1 = moondream.query(image=encoded, question=q, reasoning=False)
        print(f"Q: {q}")
        print(f"A: {result1['answer']}\n")
    # The encoded image also works with the other skills
    caption = moondream.caption(encoded, length="normal")
    objects = moondream.detect(encoded, "poop")
    points = moondream.point(encoded, "grass")
    print(f"caption: {caption}, objects: {objects}, points: {points}")
    # Segment an object in the first image
    result2 = moondream.segment(images[0], "cat")
    svg_path = result2["path"]
    bbox = result2["bbox"]
    print(f"SVG Path: {svg_path[:100]}...")
    print(f"Bounding box: {bbox}")
    # With a spatial hint (point) to guide segmentation
    result3 = moondream.segment(images[0], "cat", spatial_refs=[[0.5, 0.3]])
    print(result3)
    # With a spatial hint (bounding box)
    result4 = moondream.segment(images[0], "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
    print(result4)
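
    # --- Batch Q&A used by the Gradio interface below ---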
result = ""
Q_and_A = ""
prompts = [p.strip() for p in prompt_text.split('?')]
image_embeds = [img[0] for img in image_tuples if img[0] is not None]
answers = []
for prompt in prompts:
answers.append(moondream.query(
images=[img.convert("RGB") for img in image_embeds],
prompts=[prompt] * len(image_embeds),
))
    for i, prompt in enumerate(prompts):
        Q_and_A += f"### Q: {prompt}\n"
        for j, answer_text in enumerate(answers[i]):
            image_name = f"image{j+1}"
            Q_and_A += f"**{image_name} A:** \n {answer_text} \n"
    # Rows are images, columns are prompts, matching the headers
    result = {'headers': prompts, 'data': [list(row) for row in zip(*answers)]}
    print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
    return Q_and_A, result
"""
Alternative: load the older moondream2 checkpoint and its tokenizer instead of the moondream3 preview.
moondream = AutoModelForCausalLM.from_pretrained(
"vikhyatk/moondream2",
revision="2025-01-09",
trust_remote_code=True,
device_map={"": "cuda"},
)
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
"""
with gr.Blocks() as demo:
gr.Markdown("# moondream2 unofficial batch processing demo")
gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
with gr.Row():
img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
with gr.Row():
prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts (one prompt for each image provided) separated by question marks. Ex: Describe this image? What is in this image?", lines=8)
with gr.Row():
submit = gr.Button("Submit")
with gr.Row():
output = gr.Markdown(label="Questions and Answers", line_breaks=True)
with gr.Row():
output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])
demo.queue().launch()