Spaces:

ankandrew
/

Qwen2.5VL

Running on Zero

App Files Community

ankandrew committed on Apr 18

Commit

b3d5d95

1 Parent(s): c5c055b

Tweaks

Browse files

Files changed (1) hide show

app.py +46 -8

app.py CHANGED Viewed

@@ -21,7 +21,7 @@ MODEL_NAMES = {
 @spaces.GPU(duration=300)
-def run_inference(model_key, input_type, text, image, video, fps):
     """
     Load the selected Qwen2.5-VL model and run inference on text, image, or video.
     """
@@ -54,11 +54,17 @@ def run_inference(model_key, input_type, text, image, video, fps):
         video_src = video if str(video).startswith("file://") else f"file://{video}"
         content.append({"type": "video", "video": video_src, "fps": fps})
     content.append({"type": "text", "text": text or ""})
-    msg = [{"role": "user", "content": content}]
     # Prepare inputs for model with video kwargs
     text_prompt = processor.apply_chat_template(
-        msg, tokenize=False, add_generation_prompt=True
     )
     image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
     inputs = processor(
@@ -83,16 +89,39 @@ with demo:
     gr.Markdown("# Qwen2.5-VL Multimodal Demo")
     model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
     input_type = gr.Radio(["text", "image", "video"], label="Input Type")
-    text_input = gr.Textbox(lines=3, placeholder="Enter text...", visible=True)
-    image_input = gr.Image(type="filepath", visible=False)
     video_input = gr.Video(visible=False)
-    fps_input = gr.Slider(minimum=0.1, maximum=30.0, step=0.1, value=2.0, label="FPS", visible=False)
     output = gr.Textbox(label="Output")
     # Show/hide inputs based on selection
     def update_inputs(choice):
         return (
-            gr.update(visible=(choice == "text")),
             gr.update(visible=(choice == "image")),
             gr.update(visible=(choice == "video")),
             gr.update(visible=(choice == "video"))
@@ -102,7 +131,16 @@ with demo:
     run_btn = gr.Button("Generate")
     run_btn.click(
         run_inference,
-        [model_select, input_type, text_input, image_input, video_input, fps_input],
         output
     )

 @spaces.GPU(duration=300)
+def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
     """
     Load the selected Qwen2.5-VL model and run inference on text, image, or video.
     """
         video_src = video if str(video).startswith("file://") else f"file://{video}"
         content.append({"type": "video", "video": video_src, "fps": fps})
     content.append({"type": "text", "text": text or ""})
+    msg = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user",   "content": content}
+    ]
     # Prepare inputs for model with video kwargs
     text_prompt = processor.apply_chat_template(
+        msg,
+        tokenize=False,
+        add_generation_prompt=True,
+        add_vision_id=add_vision_id
     )
     image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
     inputs = processor(
     gr.Markdown("# Qwen2.5-VL Multimodal Demo")
     model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
     input_type = gr.Radio(["text", "image", "video"], label="Input Type")
+    system_prompt_input = gr.Textbox(
+        lines=2,
+        placeholder="System prompt…",
+        value="You are a helpful assistant.",
+        label="System Prompt"
+   )
+    vision_id_checkbox = gr.Checkbox(
+        label="Add vision ID",
+        value=False
+   )
+    text_input = gr.Textbox(
+        lines=3,
+        placeholder="Enter text ...",
+        visible=True
+    )
+    image_input = gr.File(
+        file_count="multiple",
+        file_types=["image"],
+        label="Upload Images",
+        visible=False
+    )
     video_input = gr.Video(visible=False)
+    fps_input = gr.Number(
+      value=2.0,
+      label="FPS",
+      visible=False
+    )
     output = gr.Textbox(label="Output")
     # Show/hide inputs based on selection
     def update_inputs(choice):
         return (
+            gr.update(visible=True),
             gr.update(visible=(choice == "image")),
             gr.update(visible=(choice == "video")),
             gr.update(visible=(choice == "video"))
     run_btn = gr.Button("Generate")
     run_btn.click(
         run_inference,
+        [
+                model_select,
+                input_type,
+                text_input,
+                image_input,
+                video_input,
+                fps_input,
+                system_prompt_input,
+                vision_id_checkbox
+            ],
         output
     )