linoyts (HF Staff) committed
Commit 9fb37c1 · verified · 1 Parent(s): a487b94

Update app.py

Files changed (1)
  1. app.py +5 -403
app.py CHANGED
@@ -28,354 +28,6 @@ from PIL import Image
  import os
  import gradio as gr
 
- def turn_into_video(input_images, output_images, prompt, progress=gr.Progress(track_tqdm=True)):
- if not input_images or not output_images:
- raise gr.Error("Please generate an output image first.")
-
- progress(0.02, desc="Preparing images...")
-
- def extract_pil(img_entry):
- if isinstance(img_entry, tuple) and isinstance(img_entry[0], Image.Image):
- return img_entry[0]
- elif isinstance(img_entry, Image.Image):
- return img_entry
- elif isinstance(img_entry, str):
- return Image.open(img_entry)
- else:
- raise gr.Error(f"Unsupported image format: {type(img_entry)}")
-
- start_img = extract_pil(input_images[0])
- end_img = extract_pil(output_images[0])
-
- progress(0.10, desc="Saving temp files...")
-
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_start, \
- tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_end:
- start_img.save(tmp_start.name)
- end_img.save(tmp_end.name)
-
- progress(0.20, desc="Connecting to Wan space...")
-
- client = Client("multimodalart/wan-2-2-first-last-frame")
-
- progress(0.35, desc="Generating video...")
-
- video_path, seed = client.predict(
- start_image_pil=handle_file(tmp_start.name),
- end_image_pil=handle_file(tmp_end.name),
- prompt=prompt or "smooth cinematic transition",
- api_name="/generate_video"
- )
-
- progress(0.95, desc="Finalizing...")
- print(video_path)
- return video_path['video']
-
-
-
-
- SYSTEM_PROMPT = '''
- # Edit Instruction Rewriter
- You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
- Please strictly follow the rewriting rules below:
- ## 1. General Principles
- - Keep the rewritten prompt **concise and comprehensive**. Avoid overly long sentences and unnecessary descriptive language.
- - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
- - Keep the main part of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
- - All added objects or modifications must align with the logic and style of the scene in the input images.
- - If multiple sub-images are to be generated, describe the content of each sub-image individually.
- ## 2. Task-Type Handling Rules
- ### 1. Add, Delete, Replace Tasks
- - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
- - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
- > Original: "Add an animal"
- > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
- - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
- - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
- ### 2. Text Editing Tasks
- - All text content must be enclosed in English double quotes `" "`. Keep the original language of the text, and keep the capitalization.
- - Both adding new text and replacing existing text are text replacement tasks, For example:
- - Replace "xx" to "yy"
- - Replace the mask / bounding box to "yy"
- - Replace the visual object to "yy"
- - Specify text position, color, and layout only if user has required.
- - If font is specified, keep the original language of the font.
- ### 3. Human Editing Tasks
- - Make the smallest changes to the given user's prompt.
- - If changes to background, action, expression, camera shot, or ambient lighting are required, please list each modification individually.
- - **Edits to makeup or facial features / expression must be subtle, not exaggerated, and must preserve the subject's identity consistency.**
- > Original: "Add eyebrows to the face"
- > Rewritten: "Slightly thicken the person's eyebrows with little change, look natural."
- ### 4. Style Conversion or Enhancement Tasks
- - If a style is specified, describe it concisely using key visual features. For example:
- > Original: "Disco style"
- > Rewritten: "1970s disco style: flashing lights, disco ball, mirrored walls, vibrant colors"
- - For style reference, analyze the original image and extract key characteristics (color, composition, texture, lighting, artistic style, etc.), integrating them into the instruction.
- - **Colorization tasks (including old photo restoration) must use the fixed template:**
- "Restore and colorize the old photo."
- - Clearly specify the object to be modified. For example:
- > Original: Modify the subject in Picture 1 to match the style of Picture 2.
- > Rewritten: Change the girl in Picture 1 to the ink-wash style of Picture 2 — rendered in black-and-white watercolor with soft color transitions.
- ### 5. Material Replacement
- - Clearly specify the object and the material. For example: "Change the material of the apple to papercut style."
- - For text material replacement, use the fixed template:
- "Change the material of text "xxxx" to laser style"
- ### 6. Logo/Pattern Editing
- - Material replacement should preserve the original shape and structure as much as possible. For example:
- > Original: "Convert to sapphire material"
- > Rewritten: "Convert the main subject in the image to sapphire material, preserving similar shape and structure"
- - When migrating logos/patterns to new scenes, ensure shape and structure consistency. For example:
- > Original: "Migrate the logo in the image to a new scene"
- > Rewritten: "Migrate the logo in the image to a new scene, preserving similar shape and structure"
- ### 7. Multi-Image Tasks
- - Rewritten prompts must clearly point out which image's element is being modified. For example:
- > Original: "Replace the subject of picture 1 with the subject of picture 2"
- > Rewritten: "Replace the girl of picture 1 with the boy of picture 2, keeping picture 2's background unchanged"
- - For stylization tasks, describe the reference image's style in the rewritten prompt, while preserving the visual content of the source image.
- ## 3. Rationale and Logic Check
- - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" requires logical correction.
- - Supplement missing critical information: e.g., if position is unspecified, choose a reasonable area based on composition (near subject, blank space, center/edge, etc.).
- # Output Format Example
- ```json
- {
- "Rewritten": "..."
- }
- '''
-
-
- NEXT_SCENE_SYSTEM_PROMPT = '''
- # Next Scene Prompt Generator
- You are a cinematic AI director assistant. Your task is to analyze the provided image and generate a compelling "Next Scene" prompt that describes the natural cinematic progression from the current frame.
- ## Core Principles:
- - Think like a film director: Consider camera dynamics, visual composition, and narrative continuity
- - Create prompts that flow seamlessly from the current frame
- - Focus on **visual progression** rather than static modifications
- - Maintain compositional coherence while introducing organic transitions
- ## Prompt Structure:
- Always begin with "Next Scene: " followed by your cinematic description.
- ## Key Elements to Include:
- 1. **Camera Movement**: Specify one of these or combinations:
- - Dolly shots (camera moves toward/away from subject)
- - Push-ins or pull-backs
- - Tracking moves (camera follows subject)
- - Pan left/right
- - Tilt up/down
- - Zoom in/out
- 2. **Framing Evolution**: Describe how the shot composition changes:
- - Wide to close-up transitions
- - Angle shifts (high angle to eye level, etc.)
- - Reframing of subjects
- - Revealing new elements in frame
- 3. **Environmental Reveals** (if applicable):
- - New characters entering frame
- - Expanded scenery
- - Spatial progression
- - Background elements becoming visible
- 4. **Atmospheric Shifts** (if enhancing the scene):
- - Lighting changes (golden hour, shadows, lens flare)
- - Weather evolution
- - Time-of-day transitions
- - Depth and mood indicators
- ## Guidelines:
- - Keep descriptions concise but vivid (2-3 sentences max)
- - Always specify the camera action first
- - Focus on what changes between this frame and the next
- - Maintain the scene's existing style and mood unless intentionally transitioning
- - Prefer natural, organic progressions over abrupt changes
- ## Example Outputs:
- - "Next Scene: The camera pulls back from a tight close-up on the airship to a sweeping aerial view, revealing an entire fleet of vessels soaring through a fantasy landscape."
- - "Next Scene: The camera tracks forward and tilts down, bringing the sun and helicopters closer into frame as a strong lens flare intensifies."
- - "Next Scene: The camera pans right, removing the dragon and rider from view while revealing more of the floating mountain range in the distance."
- - "Next Scene: The camera moves slightly forward as sunlight breaks through the clouds, casting a soft glow around the character's silhouette in the mist. Realistic cinematic style, atmospheric depth."
- ## Output Format:
- Return ONLY the next scene prompt as plain text, starting with "Next Scene: "
- Do NOT include JSON formatting or additional explanations.
- '''
-
- # --- Prompt Enhancement using Hugging Face InferenceClient ---
- def polish_prompt_hf(original_prompt, img_list):
- """
- Rewrites the prompt using a Hugging Face InferenceClient.
- """
- # Ensure HF_TOKEN is set
- api_key = os.environ.get("HF_TOKEN")
- if not api_key:
- print("Warning: HF_TOKEN not set. Falling back to original prompt.")
- return original_prompt
-
- try:
- # Initialize the client
- prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\nRewritten Prompt:"
- client = InferenceClient(
- provider="nebius",
- api_key=api_key,
- )
-
- # Format the messages for the chat completions API
- sys_promot = "you are a helpful assistant, you should provide useful answers to users."
- messages = [
- {"role": "system", "content": sys_promot},
- {"role": "user", "content": []}]
- for img in img_list:
- messages[1]["content"].append(
- {"image": f"data:image/png;base64,{encode_image(img)}"})
- messages[1]["content"].append({"text": f"{prompt}"})
-
- # Call the API
- completion = client.chat.completions.create(
- model="Qwen/Qwen2.5-VL-72B-Instruct",
- messages=messages,
- )
-
- # Parse the response
- result = completion.choices[0].message.content
-
- # Try to extract JSON if present
- if '"Rewritten"' in result:
- try:
- # Clean up the response
- result = result.replace('```json', '').replace('```', '')
- result_json = json.loads(result)
- polished_prompt = result_json.get('Rewritten', result)
- except:
- polished_prompt = result
- else:
- polished_prompt = result
-
- polished_prompt = polished_prompt.strip().replace("\n", " ")
- return polished_prompt
-
- except Exception as e:
- print(f"Error during API call to Hugging Face: {e}")
- # Fallback to original prompt if enhancement fails
- return original_prompt
-
- def next_scene_prompt(original_prompt, img_list):
- """
- Rewrites the prompt using a Hugging Face InferenceClient.
- Supports multiple images via img_list.
- """
- # Ensure HF_TOKEN is set
- api_key = os.environ.get("HF_TOKEN")
- if not api_key:
- print("Warning: HF_TOKEN not set. Falling back to original prompt.")
- return original_prompt
- prompt = f"{NEXT_SCENE_SYSTEM_PROMPT}"
- system_prompt = "you are a helpful assistant, you should provide useful answers to users."
- try:
- # Initialize the client
- client = InferenceClient(
- provider="nebius",
- api_key=api_key,
- )
-
- # Convert list of images to base64 data URLs
- image_urls = []
- if img_list is not None:
- # Ensure img_list is actually a list
- if not isinstance(img_list, list):
- img_list = [img_list]
-
- for img in img_list:
- image_url = None
- # If img is a PIL Image
- if hasattr(img, 'save'): # Check if it's a PIL Image
- buffered = BytesIO()
- img.save(buffered, format="PNG")
- img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
- image_url = f"data:image/png;base64,{img_base64}"
- # If img is already a file path (string)
- elif isinstance(img, str):
- with open(img, "rb") as image_file:
- img_base64 = base64.b64encode(image_file.read()).decode('utf-8')
- image_url = f"data:image/png;base64,{img_base64}"
- else:
- print(f"Warning: Unexpected image type: {type(img)}, skipping...")
- continue
-
- if image_url:
- image_urls.append(image_url)
-
- # Build the content array with text first, then all images
- content = [
- {
- "type": "text",
- "text": prompt
- }
- ]
-
- # Add all images to the content
- for image_url in image_urls:
- content.append({
- "type": "image_url",
- "image_url": {
- "url": image_url
- }
- })
-
- # Format the messages for the chat completions API
- messages = [
- {"role": "system", "content": system_prompt},
- {
- "role": "user",
- "content": content
- }
- ]
-
- # Call the API
- completion = client.chat.completions.create(
- model="Qwen/Qwen2.5-VL-72B-Instruct",
- messages=messages,
- )
-
- # Parse the response
- result = completion.choices[0].message.content
-
- # Try to extract JSON if present
- if '"Rewritten"' in result:
- try:
- # Clean up the response
- result = result.replace('```json', '').replace('```', '')
- result_json = json.loads(result)
- polished_prompt = result_json.get('Rewritten', result)
- except:
- polished_prompt = result
- else:
- polished_prompt = result
-
- polished_prompt = polished_prompt.strip().replace("\n", " ")
- return polished_prompt
-
- except Exception as e:
- print(f"Error during API call to Hugging Face: {e}")
- # Fallback to original prompt if enhancement fails
- return original_prompt
-
-
- def update_history(new_images, history):
- """Updates the history gallery with the new images."""
- time.sleep(0.5) # Small delay to ensure images are ready
- if history is None:
- history = []
- if new_images is not None and len(new_images) > 0:
- if not isinstance(history, list):
- history = list(history) if history else []
- for img in new_images:
- history.insert(0, img)
- history = history[:20] # Keep only last 20 images
- return history
-
- def use_history_as_input(evt: gr.SelectData):
- """Sets the selected history image as the new input image."""
- if evt.value is not None:
- return gr.update(value=[(evt.value,)])
- return gr.update()
-
- def encode_image(pil_image):
- import io
- buffered = io.BytesIO()
- pil_image.save(buffered, format="PNG")
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
  # --- Model Loading ---
  dtype = torch.bfloat16
@@ -388,11 +40,11 @@ pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509",
  device_map='cuda'),torch_dtype=dtype).to(device)
 
  pipe.load_lora_weights(
- "lovis93/next-scene-qwen-image-lora-2509",
- weight_name="next-scene_lora-v2-3000.safetensors", adapter_name="next-scene"
+ "dx8152/Qwen-Edit-2509-Multiple-angles",
+ weight_name="镜头转换.safetensors", adapter_name="angles"
  )
- pipe.set_adapters(["next-scene"], adapter_weights=[1.])
- pipe.fuse_lora(adapter_names=["next-scene"], lora_scale=1.)
+ pipe.set_adapters(["angles"], adapter_weights=[1.])
+ pipe.fuse_lora(adapter_names=["angles"], lora_scale=1.)
  pipe.unload_lora_weights()
 
 
@@ -443,7 +95,6 @@ def infer(
  num_inference_steps=4,
  height=None,
  width=None,
- rewrite_prompt=True,
  num_images_per_prompt=1,
  progress=gr.Progress(track_tqdm=True),
  ):
@@ -478,9 +129,6 @@ def infer(
  print(f"Calling pipeline with prompt: '{prompt}'")
  print(f"Negative Prompt: '{negative_prompt}'")
  print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}")
- if rewrite_prompt and len(pil_images) > 0:
- prompt = polish_prompt_hf(prompt, pil_images)
- print(f"Rewritten Prompt: {prompt}")
 
 
  # Generate the image
@@ -590,8 +238,6 @@ with gr.Blocks(css=css) as demo:
  value=None,
  )
 
-
- rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=False)
 
 
 
@@ -599,19 +245,8 @@ with gr.Blocks(css=css) as demo:
  result = gr.Gallery(label="Result", show_label=False, type="pil")
  with gr.Row():
  use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)
- turn_video_btn = gr.Button("🎬 Turn into Video", variant="secondary", size="sm", visible=False)
- output_video = gr.Video(label="Generated Video", autoplay=True, visible=False)
 
- with gr.Row(visible=False):
- gr.Markdown("### 📜 History")
- clear_history_button = gr.Button("🗑️ Clear History", size="sm", variant="stop")
 
- history_gallery = gr.Gallery(
- label="Click any image to use as input",
- interactive=False,
- show_label=True,
- visible=False
- )
 
  gr.Examples(examples=[
  [["disaster_girl.jpg", "grumpy.png"], "Next Scene: the camera zooms in, showing the cat walking away from the fire"],
@@ -637,14 +272,8 @@ with gr.Blocks(css=css) as demo:
  num_inference_steps,
  height,
  width,
- rewrite_prompt,
  ],
- outputs=[result, seed, use_output_btn, turn_video_btn],
+ outputs=[result, seed, use_output_btn],
-
- ).then(
- fn=update_history,
- inputs=[result, history_gallery],
- outputs=history_gallery,
 
  )
 
@@ -655,33 +284,6 @@ with gr.Blocks(css=css) as demo:
  outputs=[input_images]
  )
 
- # History gallery event handlers
- history_gallery.select(
- fn=use_history_as_input,
- inputs=None,
- outputs=[input_images],
-
- )
-
- clear_history_button.click(
- fn=lambda: [],
- inputs=None,
- outputs=history_gallery,
-
- )
-
- input_images.change(fn=suggest_next_scene_prompt, inputs=[input_images], outputs=[prompt])
-
- turn_video_btn.click(
- fn=lambda: gr.update(visible=True),
- inputs=None,
- outputs=[output_video],
- ).then(
- fn=turn_into_video,
- inputs=[input_images, result, prompt],
- outputs=[output_video],
- )
-
 
  if __name__ == "__main__":
  demo.launch()
 