morthens
/

qwen2-vl-inference

@@ -1,81 +1,62 @@
-from typing import Dict, List, Any
-import json
 import torch
 from PIL import Image
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 class EndpointHandler:
-    def __init__(self, model_name: str ="morthens/qwen2-vl-inference"):
-        # Load the model and processor
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name,
             torch_dtype="auto",
             device_map="auto"
         )
-        self.processor = AutoProcessor.from_pretrained(model_name)
-    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        # Extract image path and messages from the request data
-        image_path = data.get("image_path", "")
-        messages = data.get("messages", [])
-        # Load the image
         try:
-            image = Image.open(image_path)
-        except FileNotFoundError:
-            return [{"error": "Image file not found."}]
         except Exception as e:
-            return [{"error": str(e)}]
-        # Prepare the text prompt from messages
-        text_prompt = self.create_text_prompt(messages)
-        # Process inputs for the model
         inputs = self.processor(
-            text=[text_prompt],
             images=[image],
             padding=True,
             return_tensors="pt"
         )
-        # Move inputs to GPU if available
-        inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
-        # Generate output using the model
-        output_ids = self.model.generate(**inputs, max_new_tokens=128)
-        # Decode the generated output
-        generated_ids = [
-            output_ids[len(input_ids):]
-            for input_ids, output_ids in zip(inputs.input_ids, output_ids)
-        ]
         output_text = self.processor.batch_decode(
-            generated_ids,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
-        )
-        # Clean and parse JSON from output text
-        cleaned_data = self.clean_output(output_text[0])
-        try:
-            json_data = json.loads(cleaned_data)
-        except json.JSONDecodeError:
-            return [{"error": "Failed to parse JSON output."}]
-        return [json_data]
-    def create_text_prompt(self, messages: List[Dict[str, Any]]) -> str:
-        """Extracts and formats text content from messages."""
-        text_content = ""
-        for message in messages:
-            for content in message.get('content', []):
-                if content['type'] == 'text':
-                    text_content += content['text']
-        return self.processor.apply_chat_template(messages, add_generation_prompt=True)
-    def clean_output(self, output: str) -> str:
-        """Cleans up the model's output for JSON parsing."""
-        return output.replace("```json\n", "").replace("```", "").strip()

+from typing import Dict, Any
 import torch
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from PIL import Image
+import requests
+from io import BytesIO
+# Check for GPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 class EndpointHandler:
+    def __init__(self, path: str = "morthens/qwen2-vl-inference"):
+        # Load the processor and model
+        self.processor = AutoProcessor.from_pretrained(path)
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+            path,
             torch_dtype="auto",
             device_map="auto"
         )
+        # Move the model to the appropriate device
+        self.model.to(device)
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        # Extract the input data
+        image_url = data.get("image_url", "")
+        text = data.get("text", "")
+        # Load the image from the URL
         try:
+            response = requests.get(image_url)
+            response.raise_for_status()
+            image = Image.open(BytesIO(response.content))
         except Exception as e:
+            return {"error": f"Failed to fetch or process image: {str(e)}"}
+        # Preprocess the input
         inputs = self.processor(
+            text=[text],
             images=[image],
             padding=True,
             return_tensors="pt"
         )
+        # Move inputs to the correct device
+        inputs = {key: value.to(device) for key, value in inputs.items()}
+        # Perform inference
+        output_ids = self.model.generate(
+            **inputs,
+            max_new_tokens=128
+        )
+        # Decode the output
         output_text = self.processor.batch_decode(
+            output_ids,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=True
+        )[0]
+        # Return the raw prediction
+        return {"prediction": output_text}