Spaces:

prithivMLmods
/

SAM3-Demo

Running on Zero

App Files Files Community

prithivMLmods committed on 28 days ago

Commit

28be05f

verified ·

1 Parent(s): 452e070

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -21

app.py CHANGED Viewed

@@ -7,16 +7,16 @@ import numpy as np
 import torch
 import matplotlib
 import matplotlib.pyplot as plt
-from PIL import Image
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 from transformers import (
     Sam3Model, Sam3Processor,
-    Sam3VideoModel, Sam3VideoProcessor
 )
-# --- THEME CONFIGURATION ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -79,21 +79,25 @@ class CustomBlueTheme(Soft):
 app_theme = CustomBlueTheme()
-# --- GLOBAL MODEL LOADING ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Using compute device: {device}")
 print("⏳ Loading SAM3 Models permanently into memory...")
 try:
-    # 1. Load Image Segmentation Model
-    print("   ... Loading Image Model")
     IMG_MODEL = Sam3Model.from_pretrained("facebook/sam3").to(device)
     IMG_PROCESSOR = Sam3Processor.from_pretrained("facebook/sam3")
-    # 2. Load Video Segmentation Model
-    # Using bfloat16 for video to optimize VRAM usage while keeping speed
     print("   ... Loading Video Model")
     VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
     VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
@@ -102,8 +106,10 @@ try:
 except Exception as e:
     print(f"❌ CRITICAL ERROR LOADING MODELS: {e}")
     IMG_MODEL = None
-    VID_MODEL = None
     IMG_PROCESSOR = None
     VID_PROCESSOR = None
@@ -152,21 +158,31 @@ def apply_mask_overlay(base_image, mask_data, opacity=0.5):
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
-# --- GPU INFERENCE FUNCTIONS ---
 @spaces.GPU
 def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
     if IMG_MODEL is None or IMG_PROCESSOR is None:
-        raise gr.Error("Models failed to load on startup. Check logs.")
     if source_img is None or not text_query:
         raise gr.Error("Please provide an image and a text prompt.")
     try:
         pil_image = source_img.convert("RGB")
-        # Models are already on device, just move inputs
         model_inputs = IMG_PROCESSOR(images=pil_image, text=text_query, return_tensors="pt").to(device)
         with torch.no_grad():
@@ -179,7 +195,6 @@ def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
             target_sizes=model_inputs.get("original_sizes").tolist()
         )[0]
-        # Use AnnotatedImage format
         annotation_list = []
         raw_masks = processed_results['masks'].cpu().numpy()
         raw_scores = processed_results['scores'].cpu().numpy()
@@ -193,6 +208,50 @@ def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
     except Exception as e:
         raise gr.Error(f"Error during image processing: {e}")
 def calc_timeout_duration(vid_file, *args):
     """Return the last extra positional argument, or 60 when none is given.

     `vid_file` is accepted but not used by this function.
     NOTE(review): presumably serves as a dynamic duration callback for the
     GPU decorator on the video path -- the call site is not visible here.
     """
     if not args:
         return 60
     return args[-1]
@@ -219,7 +278,6 @@ def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
             counter += 1
         video_cap.release()
-        # VID_MODEL is already on device in bfloat16
         session = VID_PROCESSOR.init_video_session(video=video_frames, inference_device=device, dtype=torch.bfloat16)
         session = VID_PROCESSOR.add_text_prompt(inference_session=session, text=text_query)
@@ -246,16 +304,15 @@ def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
     except Exception as e:
         return None, f"Error during video processing: {str(e)}"
-# --- GUI ---
 custom_css="""
 #col-container { margin: 0 auto; max-width: 1100px; }
 #main-title h1 { font-size: 2.1em !important; }
 """
-with gr.Blocks(css=custom_css, theme=app_theme) as main_interface:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# **SAM3: Segment Anything Model 3**", elem_id="main-title")
-        gr.Markdown("Segment objects in image or video using **SAM3** (Segment Anything Model 3) with text prompts.")
         with gr.Tabs():
             with gr.Tab("Image Segmentation"):
@@ -287,7 +344,7 @@ with gr.Blocks(css=custom_css, theme=app_theme) as main_interface:
                             inputs=[image_input, txt_prompt_img, conf_slider],
                             outputs=[image_result]
                         )
             with gr.Tab("Video Segmentation"):
                 with gr.Row():
                     with gr.Column():
@@ -320,6 +377,31 @@ with gr.Blocks(css=custom_css, theme=app_theme) as main_interface:
                     inputs=[video_input, txt_prompt_vid, frame_limiter, time_limiter],
                     outputs=[video_result, process_status]
                 )
 if __name__ == "__main__":
-    main_interface.launch(ssr_mode=False, mcp_server=True, show_error=True)

 import torch
 import matplotlib
 import matplotlib.pyplot as plt
+from PIL import Image, ImageDraw
 from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 from transformers import (
     Sam3Model, Sam3Processor,
+    Sam3VideoModel, Sam3VideoProcessor,
+    Sam3TrackerModel, Sam3TrackerProcessor
 )
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
 app_theme = CustomBlueTheme()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Using compute device: {device}")
 print("⏳ Loading SAM3 Models permanently into memory...")
 try:
+    # 1. Load Image Segmentation Model (Text)
+    print("   ... Loading Image Text Model")
     IMG_MODEL = Sam3Model.from_pretrained("facebook/sam3").to(device)
     IMG_PROCESSOR = Sam3Processor.from_pretrained("facebook/sam3")
+    # 2. Load Image Tracker Model (Click)
+    print("   ... Loading Image Tracker Model")
+    TRK_MODEL = Sam3TrackerModel.from_pretrained("facebook/sam3").to(device)
+    TRK_PROCESSOR = Sam3TrackerProcessor.from_pretrained("facebook/sam3")
+    # 3. Load Video Segmentation Model
     print("   ... Loading Video Model")
+    # Using bfloat16 for video to optimize VRAM
     VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
     VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
 except Exception as e:
     print(f"❌ CRITICAL ERROR LOADING MODELS: {e}")
     IMG_MODEL = None
     IMG_PROCESSOR = None
+    TRK_MODEL = None
+    TRK_PROCESSOR = None
+    VID_MODEL = None
     VID_PROCESSOR = None
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
+def draw_points_on_image(image, points):
+    """Draws red dots on the image to indicate click locations."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    draw_img = image.copy()
+    draw = ImageDraw.Draw(draw_img)
+    for pt in points:
+        x, y = pt
+        r = 6 # Radius of point
+        draw.ellipse((x-r, y-r, x+r, y+r), fill="red", outline="white", width=2)
+    return draw_img
 @spaces.GPU
 def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
     if IMG_MODEL is None or IMG_PROCESSOR is None:
+        raise gr.Error("Models failed to load on startup.")
     if source_img is None or not text_query:
         raise gr.Error("Please provide an image and a text prompt.")
     try:
         pil_image = source_img.convert("RGB")
         model_inputs = IMG_PROCESSOR(images=pil_image, text=text_query, return_tensors="pt").to(device)
         with torch.no_grad():
             target_sizes=model_inputs.get("original_sizes").tolist()
         )[0]
         annotation_list = []
         raw_masks = processed_results['masks'].cpu().numpy()
         raw_scores = processed_results['scores'].cpu().numpy()
     except Exception as e:
         raise gr.Error(f"Error during image processing: {e}")
+@spaces.GPU
+def run_image_click_gpu(input_image, x, y, points_state, labels_state):
+    if TRK_MODEL is None or TRK_PROCESSOR is None:
+        raise gr.Error("Tracker Model failed to load.")
+    if input_image is None: return input_image, [], []
+    if points_state is None: points_state = []; labels_state = []
+    # Append new point
+    points_state.append([x, y])
+    labels_state.append(1) # 1 indicates a positive click (foreground)
+    try:
+        # Prepare inputs format: [Batch, Point_Group, Point_Idx, Coord]
+        input_points = [[points_state]]
+        input_labels = [[labels_state]]
+        inputs = TRK_PROCESSOR(images=input_image, input_points=input_points, input_labels=input_labels, return_tensors="pt").to(device)
+        with torch.no_grad():
+            # multimask_output=True usually helps with ambiguity, but let's default to best mask for simplicity here
+            outputs = TRK_MODEL(**inputs, multimask_output=False)
+        # Post process
+        masks = TRK_PROCESSOR.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"], binarize=True)[0]
+        # Overlay mask
+        # masks shape is [1, 1, H, W] for single object tracking
+        final_img = apply_mask_overlay(input_image, masks[0])
+        # Draw the visual points on top
+        final_img = draw_points_on_image(final_img, points_state)
+        return final_img, points_state, labels_state
+    except Exception as e:
+        print(f"Tracker Error: {e}")
+        return input_image, points_state, labels_state
+def image_click_handler(image, evt: gr.SelectData, points_state, labels_state):
+    # Wrapper to handle the Gradio select event
+    x, y = evt.index
+    return run_image_click_gpu(image, x, y, points_state, labels_state)
 def calc_timeout_duration(vid_file, *args):
     """Return the last extra positional argument, or 60 when none is given.

     `vid_file` is accepted but not used by this function.
     NOTE(review): presumably serves as a dynamic duration callback for the
     GPU decorator on the video path -- the call site is not visible here.
     """
     if not args:
         return 60
     return args[-1]
             counter += 1
         video_cap.release()
         session = VID_PROCESSOR.init_video_session(video=video_frames, inference_device=device, dtype=torch.bfloat16)
         session = VID_PROCESSOR.add_text_prompt(inference_session=session, text=text_query)
     except Exception as e:
         return None, f"Error during video processing: {str(e)}"
 custom_css="""
 #col-container { margin: 0 auto; max-width: 1100px; }
 #main-title h1 { font-size: 2.1em !important; }
 """
+with gr.Blocks(css=custom_css, theme=app_theme) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# **SAM3: Segment Anything Model 3**", elem_id="main-title")
+        gr.Markdown("Segment objects in image or video using **SAM3** with Text Prompts or Interactive Clicks.")
         with gr.Tabs():
             with gr.Tab("Image Segmentation"):
                             inputs=[image_input, txt_prompt_img, conf_slider],
                             outputs=[image_result]
                         )
             with gr.Tab("Video Segmentation"):
                 with gr.Row():
                     with gr.Column():
                     inputs=[video_input, txt_prompt_vid, frame_limiter, time_limiter],
                     outputs=[video_result, process_status]
                 )
+            with gr.Tab("Image Click Segmentation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        img_click_input = gr.Image(type="pil", label="Input Image (Click points)", interactive=True, height=450)
+                        with gr.Row():
+                            img_click_clear = gr.Button("Clear Points & Reset", variant="secondary")
+                        st_click_points = gr.State([])
+                        st_click_labels = gr.State([])
+                    with gr.Column(scale=1):
+                        img_click_output = gr.Image(type="pil", label="Result Preview", height=450, interactive=False)
+                img_click_input.select(
+                    image_click_handler,
+                    inputs=[img_click_input, st_click_points, st_click_labels],
+                    outputs=[img_click_output, st_click_points, st_click_labels]
+                )
+                img_click_clear.click(
+                    lambda: (None, [], []),
+                    outputs=[img_click_output, st_click_points, st_click_labels]
+                )
 if __name__ == "__main__":
+    demo.launch(ssr_mode=False, mcp_server=True, show_error=True)