Spaces:

laolida-w
/

1

Sleeping

App Files Files Community

laolida-w committed 16 days ago

Commit

b28fb05

verified ·

1 Parent(s): b7aaab4

Upload 5 files

Browse files

Files changed (5) hide show

README.md +1 -2
app.py +224 -0
eval_dataset.py +283 -0
requirements.txt +17 -0
temp.py +7 -0

README.md CHANGED Viewed

@@ -8,5 +8,4 @@ sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 app_file: app.py
 pinned: false
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import sys
+from eval_dataset import SingleRegionCaptionDataset
+from segment_anything import sam_model_registry, SamPredictor
+import gradio as gr
+import numpy as np
+import cv2
+import base64
+import torch
+from PIL import Image
+import io
+import argparse
+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+from transformers import AutoModel, AutoProcessor, GenerationConfig
+from transformers import SamModel, SamProcessor
+try:
+    from spaces import GPU
+except ImportError:
+    print("Spaces not installed, using dummy GPU decorator")
+    def GPU(*args, **kwargs):
+        def decorator(fn):
+            return fn
+        return decorator
+# Load SAM model
+# Use CUDA when available, otherwise CPU; only the SAM model below is moved to `device`.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
+sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+print("sam ready")
+model_path = "HaochenWang/GAR-1B"
+# Initialize the captioning model and processor
+# NOTE(review): device_map is pinned to "cuda:0" independently of `device` above, so this
+# load presumably fails on a CPU-only host - confirm whether a CPU fallback is intended.
+# trust_remote_code=True runs Python code shipped with the model repository.
+model = AutoModel.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="cuda:0",
+).eval()
+processor = AutoProcessor.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+)
+@GPU(duration=75)
+def image_to_sam_embedding(base64_image):
+    try:
+        # Decode base64 string to bytes
+        image_bytes = base64.b64decode(base64_image)
+        # Convert bytes to PIL Image
+        image = Image.open(io.BytesIO(image_bytes))
+        # Process image with SAM processor
+        inputs = sam_processor(image, return_tensors="pt").to(device)
+        # Get image embedding
+        with torch.no_grad():
+            image_embedding = sam_model.get_image_embeddings(inputs["pixel_values"])
+        # Convert to CPU and numpy
+        image_embedding = image_embedding.cpu().numpy()
+        # Encode the embedding as base64
+        embedding_bytes = image_embedding.tobytes()
+        embedding_base64 = base64.b64encode(embedding_bytes).decode('utf-8')
+        return embedding_base64
+    except Exception as e:
+        print(f"Error processing image: {str(e)}")
+        raise gr.Error(f"Failed to process image: {str(e)}")
+@GPU(duration=75)
+def describe(image_base64: str, mask_base64: str, query: str):
+    # Convert base64 to PIL Image
+    image_bytes = base64.b64decode(image_base64.split(',')[1] if ',' in image_base64 else image_base64)
+    img = Image.open(io.BytesIO(image_bytes))
+    mask_bytes = base64.b64decode(mask_base64.split(',')[1] if ',' in mask_base64 else mask_base64)
+    mask = Image.open(io.BytesIO(mask_bytes))
+    mask = np.array(mask.convert('L'))
+    prompt_number = model.config.prompt_numbers
+    prompt_tokens = [f"<Prompt{i_p}>" for i_p in range(prompt_number)] + ["<NO_Prompt>"]
+    # Assuming mask is given as a numpy array and the image is a PIL image
+    dataset = SingleRegionCaptionDataset(
+        image=img,
+        mask=mask,
+        processor=processor,
+        prompt_number=prompt_number,
+        visual_prompt_tokens=prompt_tokens,
+        data_dtype=torch.bfloat16,
+    )
+    data_sample = dataset[0]
+    # Generate the caption
+    with torch.no_grad():
+        generate_ids = model.generate(
+            **data_sample,
+            generation_config=GenerationConfig(
+                max_new_tokens=1024,
+                # do_sample= False,
+                eos_token_id=processor.tokenizer.eos_token_id,
+                pad_token_id=processor.tokenizer.pad_token_id,
+            ),
+            return_dict=True,
+        )
+    output_caption = processor.tokenizer.decode(generate_ids.sequences[0], skip_special_tokens=True).strip()
+    # Stream the tokens
+    text = ""
+    for token in output_caption:
+        text += token
+        yield text
+@GPU(duration=75)
+def describe_without_streaming(image_base64: str, mask_base64: str, query: str):
+    # Convert base64 to PIL Image
+    image_bytes = base64.b64decode(image_base64.split(',')[1] if ',' in image_base64 else image_base64)
+    img = Image.open(io.BytesIO(image_bytes))
+    mask_bytes = base64.b64decode(mask_base64.split(',')[1] if ',' in mask_base64 else mask_base64)
+    mask = Image.open(io.BytesIO(mask_bytes))
+    mask = np.array(mask.convert('L'))
+    prompt_number = model.config.prompt_numbers
+    prompt_tokens = [f"<Prompt{i_p}>" for i_p in range(prompt_number)] + ["<NO_Prompt>"]
+    # Assuming mask is given as a numpy array and the image is a PIL image
+    dataset = SingleRegionCaptionDataset(
+        image=img,
+        mask=mask,
+        processor=processor,
+        prompt_number=prompt_number,
+        visual_prompt_tokens=prompt_tokens,
+        data_dtype=torch.bfloat16,
+    )
+    data_sample = dataset[0]
+    # Generate the caption
+    with torch.no_grad():
+        generate_ids = model.generate(
+            **data_sample,
+            generation_config=GenerationConfig(
+                max_new_tokens=1024,
+                # do_sample=False,
+                eos_token_id=processor.tokenizer.eos_token_id,
+                pad_token_id=processor.tokenizer.pad_token_id,
+            ),
+            return_dict=True,
+        )
+    output_caption = processor.tokenizer.decode(generate_ids.sequences[0], skip_special_tokens=True).strip()
+    return output_caption
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Describe Anything gradio demo")
+    parser.add_argument("--server_addr", "--host", type=str, default=None, help="The server address to listen on.")
+    parser.add_argument("--server_port", "--port", type=int, default=None, help="The port to listen on.")
+    parser.add_argument("--model-path", type=str, default="HaochenWang/GAR-1B", help="Path to the model checkpoint")
+    parser.add_argument("--prompt-mode", type=str, default="full+focal_crop", help="Prompt mode")
+    parser.add_argument("--conv-mode", type=str, default="v1", help="Conversation mode")
+    parser.add_argument("--temperature", type=float, default=0.2, help="Sampling temperature")
+    parser.add_argument("--top_p", type=float, default=0.5, help="Top-p for sampling")
+    args = parser.parse_args()
+    # Create Gradio interface
+    with gr.Blocks() as demo:
+        gr.Interface(
+            fn=image_to_sam_embedding,
+            inputs=gr.Textbox(label="Image Base64"),
+            outputs=gr.Textbox(label="Embedding Base64"),
+            title="Image Embedding Generator",
+            api_name="image_to_sam_embedding"
+        )
+        gr.Interface(
+            fn=describe,
+            inputs=[
+                gr.Textbox(label="Image Base64"),
+                gr.Text(label="Mask Base64"),
+                gr.Text(label="Prompt")
+            ],
+            outputs=[
+                gr.Text(label="Description")
+            ],
+            title="Mask Description Generator",
+            api_name="describe"
+        )
+        gr.Interface(
+            fn=describe_without_streaming,
+            inputs=[
+                gr.Textbox(label="Image Base64"),
+                gr.Text(label="Mask Base64"),
+                gr.Text(label="Prompt")
+            ],
+            outputs=[
+                gr.Text(label="Description")
+            ],
+            title="Mask Description Generator (Non-Streaming)",
+            api_name="describe_without_streaming"
+        )
+    demo._block_thread = demo.block_thread
+    demo.block_thread = lambda: None
+    demo.launch(
+        share=True,
+        server_name=args.server_addr,
+        server_port=args.server_port,
+        ssr_mode=False,
+    )
+    for route in demo.app.routes:
+        if route.path == "/":
+            demo.app.routes.remove(route)
+    demo.app.mount("/", StaticFiles(directory="dist", html=True), name="demo")
+    demo._block_thread()

eval_dataset.py ADDED Viewed

	@@ -0,0 +1,283 @@

+# --------------------------------------------------------
+# Copyright (2025) Bytedance Ltd. and/or its affiliates
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Grasp Any Region Project
+# Written by Haochen Wang
+# --------------------------------------------------------
+import os
+import re
+from copy import deepcopy
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from PIL import Image
+class SingleRegionCaptionDataset(Dataset):
+    os.environ["TOKENIZERS_PARALLELISM"] = "true"
+    def __init__(
+        self,
+        image,
+        mask,
+        processor,
+        prompt_token="<Prompt1>",
+        prompt_number=5,
+        visual_prompt_tokens=[
+            "<Prompt0>",
+            "<Prompt1>",
+            "<Prompt2>",
+            "<Prompt3>",
+            "<Prompt4>",
+            "<NO_Prompt>",
+        ],
+        data_dtype=torch.bfloat16,
+        **kwargs,
+    ):
+        self.processor = processor
+        self.prompt_token = prompt_token
+        self.prompt_number = prompt_number
+        self.special_tokens = visual_prompt_tokens
+        self.visual_prompt_ids = {
+            token: self.processor.tokenizer.convert_tokens_to_ids(token) - 128256
+            for token in self.special_tokens
+        }
+        self.image = image
+        self.mask = mask
+        self.data_dtype = data_dtype
+    def __len__(self):
+        return len(self.coco.anns)
+    def _parse_annotations(self):
+        image = self.image
+        mask = self.mask  # binary mask
+        np.array(image)
+        mask_np = mask.astype(np.uint8)
+        filled_matrix = -1 * np.ones((image.height, image.width), dtype=np.uint8)
+        prompt_token = self.prompt_token
+        prompt_id = self.visual_prompt_ids.get(
+            prompt_token, self.visual_prompt_ids["<NO_Prompt>"]
+        )
+        assert prompt_id < 16, f"prompt_id should be less than {16}, got {prompt_id}"
+        fill_area = (filled_matrix == -1) & mask_np.astype(bool)
+        filled_matrix[fill_area] = prompt_id
+        filled_matrix[filled_matrix == -1] = self.visual_prompt_ids["<NO_Prompt>"]
+        bboxes = {}
+        prompt_idx = int(re.match(r"<Prompt(\d+)>", prompt_token).group(1))
+        non_zero_coords = np.argwhere(mask_np)
+        y_min, x_min = non_zero_coords.min(axis=0)
+        y_max, x_max = non_zero_coords.max(axis=0)
+        bbox = (
+            x_min / image.width,
+            y_min / image.height,
+            x_max / image.width,
+            y_max / image.height,
+        )
+        bboxes[
+            str(
+                self.processor.tokenizer.convert_tokens_to_ids(
+                    f"<|reserved_special_token_{prompt_idx + 2}|>"
+                )
+            )
+        ] = bbox
+        data_dict = {
+            "image": image,
+            "visual_prompt": Image.fromarray(filled_matrix),
+            "bboxes": bboxes,
+        }
+        return data_dict
+    def __getitem__(self, index):
+        data_dict = deepcopy(self._parse_annotations())
+        image = data_dict["image"]
+        visual_prompt = data_dict["visual_prompt"]
+        prompt_idx = int(re.match(r"<Prompt(\d+)>", self.prompt_token).group(1))
+        # <|reserved_special_token_{idx}|> actually starts from 2
+        qs = f"There are some objects I am curious about: {self.prompt_token};\n{self.prompt_token}: <|reserved_special_token_{prompt_idx + 2}|>Describe this masked region in detail."
+        qs = qs.replace(
+            f"<|reserved_special_token_{prompt_idx + 2}|>",
+            f"<|reserved_special_token_{prompt_idx + 2}|>" * 256,
+        )
+        user_content = [{"type": "image", "image": image}, {"type": "text", "text": qs}]
+        messages = [
+            {"role": "user", "content": user_content},
+        ]
+        # Prepare input for model
+        raw_prompt = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        model_inputs = self.processor(text=[raw_prompt], images=[image], visual_prompts=[visual_prompt], return_tensors="pt")
+        pixel_values = model_inputs["pixel_values"]
+        mask_values = model_inputs["mask_values"]
+        input_ids = model_inputs["input_ids"].squeeze(0)
+        attention_mask = model_inputs["attention_mask"].squeeze(0)
+        aspect_ratio = model_inputs["aspect_ratio"]
+        ret = dict(
+            input_ids=input_ids.cuda().unsqueeze(0),
+            attention_mask=attention_mask.cuda().to(self.data_dtype).unsqueeze(0),
+            pixel_values=pixel_values.cuda().to(self.data_dtype).flatten(0, 1),
+            global_mask_values=mask_values.cuda().to(self.data_dtype).squeeze(),
+            bboxes=[data_dict["bboxes"]],
+            aspect_ratios=aspect_ratio.unsqueeze(0).cuda(),
+        )
+        return ret
+class MultiRegionDataset(Dataset):
+    os.environ["TOKENIZERS_PARALLELISM"] = "true"
+    def __init__(
+        self,
+        image,
+        masks,
+        question_str,
+        processor,
+        prompt_token="<Prompt1>",
+        prompt_number=5,
+        visual_prompt_tokens=[
+            "<Prompt0>",
+            "<Prompt1>",
+            "<Prompt2>",
+            "<Prompt3>",
+            "<Prompt4>",
+            "<NO_Prompt>",
+        ],
+        data_dtype=torch.bfloat16,
+        **kwargs,
+    ):
+        self.processor = processor
+        self.prompt_token = prompt_token
+        self.prompt_number = prompt_number
+        self.special_tokens = visual_prompt_tokens
+        self.visual_prompt_ids = {
+            token: self.processor.tokenizer.convert_tokens_to_ids(token) - 128256
+            for token in self.special_tokens
+        }
+        self.image = image
+        self.masks = masks
+        self.question_str = question_str
+        self.data_dtype = data_dtype
+    def __len__(self):
+        return len(self.coco.anns)
+    def _parse_annotations(self):
+        image = self.image
+        masks = self.masks  # binary mask
+        width, height = image.size
+        np.array(image)
+        masks_np = [np.array(mask).astype(np.uint8) for mask in masks]
+        for mask_id, mask in enumerate(masks_np):
+            if image.width != mask.shape[1] or image.height != mask.shape[0]:
+                mask = mask.resize(image.size, Image.NEAREST)
+                masks[mask_id] = mask
+                masks_np[mask_id] = np.array(mask).astype(np.unint8)
+        prompt_matches = set(re.findall(r'<Prompt\d+>', self.question_str))
+        assert len(prompt_matches) == len(masks)
+        objects_desc = "There are some objects I am curious about: "
+        sub_image_desc = ""
+        for matched_prompt in prompt_matches:
+            objects_desc += f"{matched_prompt}; "
+            prompt_idx = int(re.match(r'<Prompt(\d+)>', matched_prompt).group(1))
+            sub_image_desc += f"{matched_prompt}: <|reserved_special_token_{prompt_idx + 2}|>\n"
+            sub_image_desc = sub_image_desc.replace(f"<|reserved_special_token_{prompt_idx + 2}|>", f"<|reserved_special_token_{prompt_idx + 2}|>" * 256)
+        prompt = objects_desc + "\n" + sub_image_desc + "\n" + self.question_str
+        filled_matrix = -1 * np.ones((image.height, image.width), dtype=np.uint8)
+        bboxes = {}
+        for matched_prompt in prompt_matches:
+            prompt_idx = int(re.match(r'<Prompt(\d+)>', matched_prompt).group(1))
+            mask = masks[prompt_idx]
+            prompt_token = matched_prompt
+            prompt_id = self.visual_prompt_ids.get(prompt_token, self.visual_prompt_ids["<NO_Prompt>"])
+            assert prompt_id < self.prompt_number + 1, f"prompt_id should be less than {self.prompt_numbers + 1}, got {prompt_id}"
+            fill_area = (filled_matrix == -1) & mask.astype(bool)
+            filled_matrix[fill_area] = prompt_id
+            non_zero_coords = np.argwhere(masks_np[mask_id])
+            y_min, x_min = non_zero_coords.min(axis=0)
+            y_max, x_max = non_zero_coords.max(axis=0)
+            bbox = (x_min / image.width, y_min / image.height, x_max / image.width, y_max / image.height)
+            bboxes[str(self.processor.tokenizer.convert_tokens_to_ids(f"<|reserved_special_token_{prompt_idx + 2}|>"))] = bbox
+        filled_matrix[filled_matrix == -1] = self.visual_prompt_ids["<NO_Prompt>"]
+        # convert masks to PIL.Image
+        masks = [Image.fromarray((masks_np[i] * 255).astype(np.uint8)) for i in range(len(masks))]
+        data_dict = {
+            'image': image,
+            'visual_prompt': Image.fromarray(filled_matrix),
+            'bboxes': bboxes,
+            'prompt': prompt,
+        }
+        return data_dict
+    def __getitem__(self, index):
+        data_dict = self._parse_annotations()
+        image = data_dict["image"]
+        visual_prompt = data_dict["visual_prompt"]
+        qs = data_dict["prompt"]
+        user_content = [
+            {"type": "image", "image": image},
+            {"type": "text", "text": qs}
+        ]
+        messages = [
+            {"role": "user", "content": user_content},
+        ]
+        # Prepare input for model
+        raw_prompt = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        model_inputs = self.processor(text=[raw_prompt], images=[image], visual_prompts=[visual_prompt], return_tensors="pt")
+        pixel_values = model_inputs["pixel_values"]
+        mask_values = model_inputs["mask_values"]
+        input_ids = model_inputs["input_ids"].squeeze(0)
+        attention_mask = model_inputs["attention_mask"].squeeze(0)
+        aspect_ratio = model_inputs["aspect_ratio"]
+        ret = dict(
+            input_ids=input_ids.cuda().unsqueeze(0),
+            attention_mask=attention_mask.cuda().to(self.data_dtype).unsqueeze(0),
+            pixel_values=pixel_values.cuda().to(self.data_dtype).flatten(0, 1),
+            global_mask_values=mask_values.cuda().to(self.data_dtype).squeeze(),
+            bboxes=[data_dict["bboxes"]],
+            aspect_ratios=aspect_ratio.unsqueeze(0).cuda(),
+        )
+        return ret

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+sentencepiece
+accelerate>=0.28.0
+pydantic>=2.10.1
+numpy>=1.23.5,<2.0.0
+pillow>=9.4.0
+gradio>=5.5.0
+requests
+httpx
+uvicorn
+fastapi
+protobuf
+opencv-python
+openai>=1.55.0
+spaces==0.30.4
+git+https://github.com/facebookresearch/segment-anything.git
+torch
+torchvision

temp.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import gradio as gr
+def greet(name):
+    return "Hello " + name + "!!"
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+demo.launch()