Commit 6d75145 · update prompt enhancer
Parent: 3d1918c

Files changed:
- app.py (+4, -24)
- prompt_enhancer.py (+14, -4)
app.py CHANGED

```diff
@@ -82,11 +82,12 @@ if lora_path:
 )
 print(f"✓ Configured scheduler (flow_shift=2.0)")
 
+pipe.to(DEVICE)
 end = time.time()
 print(f"Model loaded in {end - start:.2f}s.")
 
 start = time.time()
-prompt_enhancer_model = "Qwen/Qwen3-VL-…
+prompt_enhancer_model = "Qwen/Qwen3-VL-8B-Instruct"
 prompt_model, processor = load_model(prompt_enhancer_model)
 end = time.time()
 print(f"Prompt enhancer loaded in {end - start:.2f}s.")
@@ -124,12 +125,6 @@ def run_inference(
     shift: float = 2.0,
     num_temporal_reasoning_steps: int = 8,
 ):
-
-    start = time.time()
-    prompt_model.to(DEVICE)
-    end = time.time()
-    print(f"Pipeline moved to device in {end - start:.2f}s")
-
     # Rewriter
     final_prompt = prompt
 
@@ -152,11 +147,6 @@ def run_inference(
     print("=" * 80 + "\n")
     final_prompt = cot_prompt
 
-    start = time.time()
-    prompt_model.to("cpu")
-    end = time.time()
-    print(f"Pipeline moved to cpu in {end - start:.2f}s")
-
     # Inference
     print(f"Loading input image: {image_path}")
     image = load_image(image_path)
@@ -169,11 +159,6 @@ def run_inference(
     image = image.resize((width, height))
     num_frames = 29 if enable_temporal_reasoning else 5
 
-    start = time.time()
-    pipe.to(DEVICE)
-    end = time.time()
-    print(f"Pipeline moved to device in {end - start:.2f}s")
-
     start = time.time()
     output = pipe(
         image=image,
@@ -189,11 +174,6 @@ def run_inference(
     end = time.time()
     print(f"Generated video in {end - start:.2f}s")
 
-    start = time.time()
-    pipe.to("cpu")
-    end = time.time()
-    print(f"Pipeline moved to cpu in {end - start:.2f}s")
-
     video_tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     output_path_video = video_tmp.name
     video_tmp.close()
@@ -268,7 +248,7 @@ def build_ui() -> gr.Blocks:
         ],
         [
             "examples/2.png",
-            "The user wants to change the scene so that the girl in the traditional-style painting, wearing her ornate floral robe and headdress, is now playing a guitar.",
+            "The user wants to change the scene so that the girl in the traditional-style painting, wearing her ornate floral robe and headdress, is now playing a guitar. Her graceful appearance remains unchanged - smooth black hair tied neatly, soft facial features with a calm, focused expression - but her pose shifts: both hands are engaged with the guitar. One hand rests on the neck of the instrument, fingers pressing the strings with delicate precision, while the other hand strums near the sound hole. The guitar is positioned naturally across her lap, blending with the elegance of her posture. The traditional painting style is preserved, but the addition of the guitar introduces a modern contrast, giving the scene a harmonious fusion of classical refinement and contemporary music.",
             False,
         ],
         [
@@ -291,4 +271,4 @@ def build_ui() -> gr.Blocks:
 if __name__ == "__main__":
     demo = build_ui()
     # demo.launch(server_name="0.0.0.0", server_port=7869)
-    demo.queue().launch()
+    demo.queue().launch(share=True)
```
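The `run_inference` hunks above all delete the same pattern: timed `.to(DEVICE)` / `.to("cpu")` moves around each stage. With `pipe.to(DEVICE)` now done once at load time (and the prompt model placed via `device_map="auto"` in `load_model`), requests no longer pay per-call transfer costs. A minimal, self-contained sketch of the two strategies (`FakePipe` is a stand-in, not the app's real pipeline):

```python
# Sketch contrasting the removed per-request device shuffling with the
# new load-once placement. FakePipe stands in for the real pipeline.
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class FakePipe:
    def to(self, device):
        print(f"pipe -> {device}")  # the real .to() would copy weights
        return self

pipe = FakePipe()

def run_inference_before():
    pipe.to(DEVICE)   # paid a host-to-device copy on every request
    # ... generate ...
    pipe.to("cpu")    # paid a device-to-host copy on every request

pipe.to(DEVICE)       # after this commit: placed once at startup

def run_inference_after():
    pass              # ... generate; weights already resident ...
```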
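The final hunk also turns on Gradio's share tunnel. A standalone sketch of that launch pattern (standard `gradio` API; the UI body here is a placeholder):

```python
# Minimal sketch of the launch call now at the bottom of app.py.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

# queue() buffers concurrent requests (useful when each one holds the
# GPU); share=True additionally exposes a temporary public URL.
demo.queue().launch(share=True)
```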
prompt_enhancer.py CHANGED

```diff
@@ -17,8 +17,9 @@ import argparse
 import torch
 from PIL import Image
 from transformers import (
-    Qwen2_5_VLForConditionalGeneration,
-    AutoProcessor,
+    Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+    Qwen3VLForConditionalGeneration,
     Qwen3VLMoeForConditionalGeneration,
 )
 from qwen_vl_utils import process_vision_info
@@ -108,13 +109,22 @@ def load_model(model_name):
         )
         processor = AutoProcessor.from_pretrained(model_name)
 
+    elif model_name == "Qwen/Qwen3-VL-8B-Instruct":
+        model = Qwen3VLForConditionalGeneration.from_pretrained(
+            model_name,
+            dtype=torch.bfloat16,
+            attn_implementation=attn_impl,
+            device_map="auto"
+        )
+        processor = AutoProcessor.from_pretrained(model_name)
+
     else:
         raise ValueError(f"Unsupported model: {model_name}")
 
     return model, processor
 
 
-def resize_if_needed(image, max_resolution…
+def resize_if_needed(image, max_resolution):
     """Resize image so that the shortest edge is at most max_resolution pixels."""
     width, height = image.size
     if min(width, height) > max_resolution:
@@ -152,7 +162,7 @@ def _run_model_inference(messages, model, processor):
         inputs = inputs.to(model.device).to(model.dtype)
         generated_ids = model.generate(**inputs, max_new_tokens=512)
 
-    elif isinstance(model, Qwen3VLMoeForConditionalGeneration):
+    elif isinstance(model, Qwen3VLMoeForConditionalGeneration) or isinstance(model, Qwen3VLForConditionalGeneration):
         inputs = processor.apply_chat_template(
             messages,
             tokenize=True,
```
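With the new branch, `load_model` accepts the dense 8B checkpoint alongside the MoE one. A hypothetical call, assuming `prompt_enhancer.py` is on the import path and the checkpoint can be fetched from the Hub:

```python
# Hypothetical usage of the branch added to load_model(); the function
# and model name come from the diff, the calling script is assumed.
from prompt_enhancer import load_model

model, processor = load_model("Qwen/Qwen3-VL-8B-Instruct")
print(type(model).__name__)  # expected: Qwen3VLForConditionalGeneration
```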
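A side note on the widened `elif` in `_run_model_inference`: `isinstance` accepts a tuple of types, so the two chained checks could be collapsed into one. A sketch with stand-in classes so it runs without `transformers` installed:

```python
# Equivalent dispatch using isinstance's tuple form; behavior matches
# the or-chained version in the diff. Stand-in classes for brevity.
class Qwen3VLForConditionalGeneration: ...
class Qwen3VLMoeForConditionalGeneration: ...

model = Qwen3VLForConditionalGeneration()
if isinstance(model, (Qwen3VLMoeForConditionalGeneration,
                      Qwen3VLForConditionalGeneration)):
    print("route to the Qwen3-VL chat-template path")
```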