linoyts HF Staff committed on
Commit
a2cff3a
·
verified ·
1 Parent(s): 1d79e76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -155
app.py CHANGED
@@ -20,7 +20,6 @@ import os
20
  import base64
21
  from io import BytesIO
22
  import json
23
- import time
24
 
25
  SYSTEM_PROMPT = '''
26
  # Edit Instruction Rewriter
@@ -94,16 +93,13 @@ Please strictly follow the rewriting rules below:
94
  NEXT_SCENE_SYSTEM_PROMPT = '''
95
  # Next Scene Prompt Generator
96
  You are a cinematic AI director assistant. Your task is to analyze the provided image and generate a compelling "Next Scene" prompt that describes the natural cinematic progression from the current frame.
97
-
98
  ## Core Principles:
99
  - Think like a film director: Consider camera dynamics, visual composition, and narrative continuity
100
  - Create prompts that flow seamlessly from the current frame
101
  - Focus on **visual progression** rather than static modifications
102
  - Maintain compositional coherence while introducing organic transitions
103
-
104
  ## Prompt Structure:
105
  Always begin with "Next Scene: " followed by your cinematic description.
106
-
107
  ## Key Elements to Include:
108
  1. **Camera Movement**: Specify one of these or combinations:
109
  - Dolly shots (camera moves toward/away from subject)
@@ -112,38 +108,32 @@ Always begin with "Next Scene: " followed by your cinematic description.
112
  - Pan left/right
113
  - Tilt up/down
114
  - Zoom in/out
115
-
116
  2. **Framing Evolution**: Describe how the shot composition changes:
117
  - Wide to close-up transitions
118
  - Angle shifts (high angle to eye level, etc.)
119
  - Reframing of subjects
120
  - Revealing new elements in frame
121
-
122
  3. **Environmental Reveals** (if applicable):
123
  - New characters entering frame
124
  - Expanded scenery
125
  - Spatial progression
126
  - Background elements becoming visible
127
-
128
  4. **Atmospheric Shifts** (if enhancing the scene):
129
  - Lighting changes (golden hour, shadows, lens flare)
130
  - Weather evolution
131
  - Time-of-day transitions
132
  - Depth and mood indicators
133
-
134
  ## Guidelines:
135
  - Keep descriptions concise but vivid (2-3 sentences max)
136
  - Always specify the camera action first
137
  - Focus on what changes between this frame and the next
138
  - Maintain the scene's existing style and mood unless intentionally transitioning
139
  - Prefer natural, organic progressions over abrupt changes
140
-
141
  ## Example Outputs:
142
  - "Next Scene: The camera pulls back from a tight close-up on the airship to a sweeping aerial view, revealing an entire fleet of vessels soaring through a fantasy landscape."
143
  - "Next Scene: The camera tracks forward and tilts down, bringing the sun and helicopters closer into frame as a strong lens flare intensifies."
144
  - "Next Scene: The camera pans right, removing the dragon and rider from view while revealing more of the floating mountain range in the distance."
145
  - "Next Scene: The camera moves slightly forward as sunlight breaks through the clouds, casting a soft glow around the character's silhouette in the mist. Realistic cinematic style, atmospheric depth."
146
-
147
  ## Output Format:
148
  Return ONLY the next scene prompt as plain text, starting with "Next Scene: "
149
  Do NOT include JSON formatting or additional explanations.
@@ -188,157 +178,208 @@ def polish_prompt_hf(prompt, img_list):
188
  result = completion.choices[0].message.content
189
 
190
  # Try to extract JSON if present
191
- if '{"Rewritten"' in result or '"Rewritten"' in result:
192
  try:
193
- result = result.replace('```json', '').replace('```', '').strip()
194
- if result.startswith('{') and result.endswith('}'):
195
- result_json = json.loads(result)
196
- polished_prompt = result_json.get('Rewritten', result)
197
- else:
198
- polished_prompt = result
199
- except Exception as e:
200
- print(f"JSON parsing failed: {e}")
201
  polished_prompt = result
202
  else:
203
  polished_prompt = result
204
 
205
  polished_prompt = polished_prompt.strip().replace("\n", " ")
206
- print(f"Polished prompt from HF: {polished_prompt}")
207
  return polished_prompt
208
 
209
  except Exception as e:
210
  print(f"Error during API call to Hugging Face: {e}")
 
211
  return prompt
212
-
213
- def encode_image(img):
214
- """Encode PIL Image to base64 string."""
215
- buffer = BytesIO()
216
- img.save(buffer, format="PNG")
217
- return base64.b64encode(buffer.getvalue()).decode()
218
-
219
- def suggest_next_scene_prompt_hf(img_list):
220
  """
221
- Generate a cinematic "Next Scene" prompt using Hugging Face InferenceClient.
 
222
  """
 
223
  api_key = os.environ.get("HF_TOKEN")
224
- if not api_key or not img_list:
225
- return ""
226
-
 
 
227
  try:
 
228
  client = InferenceClient(
229
- provider="cerebras",
230
  api_key=api_key,
231
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
 
 
 
 
 
 
 
 
 
 
233
  messages = [
234
- {"role": "system", "content": NEXT_SCENE_SYSTEM_PROMPT},
235
- {"role": "user", "content": []}
 
 
 
236
  ]
237
-
238
- for img in img_list:
239
- messages[1]["content"].append(
240
- {"image": f"data:image/png;base64,{encode_image(img)}"})
241
- messages[1]["content"].append({"text": "Generate a natural next scene prompt for this image."})
242
-
243
  completion = client.chat.completions.create(
244
- model="Qwen/Qwen3-235B-A22B-Instruct-2507",
245
  messages=messages,
246
  )
247
 
248
- result = completion.choices[0].message.content.strip()
249
- print(f"Generated Next Scene prompt: {result}")
250
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  except Exception as e:
253
- print(f"Error generating next scene prompt: {e}")
254
- return ""
 
255
 
256
- def suggest_next_scene_prompt(images):
257
- """
258
- Wrapper function to generate next scene prompt from image gallery.
259
- """
260
- if not images:
261
- return ""
262
-
263
- pil_images = []
264
- for item in images:
265
- try:
266
- if isinstance(item[0], Image.Image):
267
- pil_images.append(item[0].convert("RGB"))
268
- elif isinstance(item[0], str):
269
- pil_images.append(Image.open(item[0]).convert("RGB"))
270
- elif hasattr(item, "name"):
271
- pil_images.append(Image.open(item.name).convert("RGB"))
272
- except Exception as e:
273
- print(f"Error processing image: {e}")
274
- continue
275
-
276
- if not pil_images:
277
- return ""
278
-
279
- return suggest_next_scene_prompt_hf(pil_images)
280
 
281
  # --- Model Loading ---
282
  dtype = torch.bfloat16
283
  device = "cuda" if torch.cuda.is_available() else "cpu"
284
 
285
- pipe = QwenImageEditPlusPipeline.from_pretrained("Phr00t/Qwen-Image-Edit-Rapid-AIO", torch_dtype=dtype).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  pipe.transformer.__class__ = QwenImageTransformer2DModel
287
  pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
288
 
289
  # --- Ahead-of-time compilation ---
290
- optimize_pipeline_(pipe, image=Image.new("RGB", (1024, 1024)), prompt="prompt")
291
 
292
- # --- Constants ---
293
  MAX_SEED = np.iinfo(np.int32).max
294
 
295
- # --- Helper Functions ---
296
  def use_output_as_input(output_images):
297
- """Convert the first image from the result gallery to input format."""
298
- if output_images and len(output_images) > 0:
299
- # output_images is a list of images
300
- first_image = output_images[0]
301
- # Return in the format expected by the Gallery: list of tuples
302
- return [first_image]
303
- return None
304
-
305
- def update_history(new_images, history):
306
- """Updates the history gallery with new images."""
307
- time.sleep(0.5) # Small delay to ensure images are ready
308
- if history is None:
309
- history = []
310
- if new_images is not None and len(new_images) > 0:
311
- # Convert to list if needed
312
- if not isinstance(history, list):
313
- history = list(history) if history else []
314
- # Add all new images to the beginning of history
315
- for img in new_images:
316
- history.insert(0, img)
317
- # Keep only the last 20 images in history
318
- history = history[:20]
319
- return history
320
-
321
- def use_history_as_input(evt: gr.SelectData):
322
- """Sets the selected history image as the new input image."""
323
- # evt.value contains the selected image
324
- if evt.value is not None:
325
- return [evt.value]
326
- return None
327
-
328
- # --- Inference Function ---
329
- @spaces.GPU
330
  def infer(
331
- images,
332
- prompt,
333
- seed=42,
334
- randomize_seed=False,
335
- true_guidance_scale=1.0,
336
- num_inference_steps=8,
337
  height=None,
338
  width=None,
339
- rewrite_prompt=False,
340
- num_images_per_prompt=1
 
341
  ):
 
 
 
 
342
  negative_prompt = " "
343
 
344
  if randomize_seed:
@@ -428,22 +469,6 @@ with gr.Blocks(css=css) as demo:
428
  result = gr.Gallery(label="Result", show_label=False, type="pil")
429
  # Add this button right after the result gallery - initially hidden
430
  use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)
431
-
432
- # Add history section
433
- gr.Markdown("---")
434
- with gr.Row():
435
- gr.Markdown("### 📜 History")
436
- clear_history_button = gr.Button("🗑️ Clear History", size="sm", variant="stop")
437
-
438
- history_gallery = gr.Gallery(
439
- label="Click any image to use as input",
440
- columns=4,
441
- rows=2,
442
- object_fit="contain",
443
- height="auto",
444
- interactive=False,
445
- show_label=True
446
- )
447
 
448
  with gr.Row():
449
  prompt = gr.Text(
@@ -506,7 +531,6 @@ with gr.Blocks(css=css) as demo:
506
 
507
  # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
508
 
509
- # Main generation events
510
  gr.on(
511
  triggers=[run_button.click, prompt.submit],
512
  fn=infer,
@@ -521,35 +545,15 @@ with gr.Blocks(css=css) as demo:
521
  width,
522
  rewrite_prompt,
523
  ],
524
- outputs=[result, seed, use_output_btn],
525
- ).then(
526
- fn=update_history,
527
- inputs=[result, history_gallery],
528
- outputs=history_gallery,
529
- show_api=False
530
  )
531
 
532
- # Add the event handler for the "Use Output as Input" button
533
  use_output_btn.click(
534
  fn=use_output_as_input,
535
  inputs=[result],
536
  outputs=[input_images]
537
  )
538
-
539
- # History gallery select handler
540
- history_gallery.select(
541
- fn=use_history_as_input,
542
- outputs=[input_images],
543
- show_api=False
544
- )
545
-
546
- # Clear history button
547
- clear_history_button.click(
548
- fn=lambda: [],
549
- inputs=None,
550
- outputs=history_gallery,
551
- show_api=False
552
- )
553
 
554
  input_images.change(fn=suggest_next_scene_prompt, inputs=[input_images], outputs=[prompt])
555
 
 
20
  import base64
21
  from io import BytesIO
22
  import json
 
23
 
24
  SYSTEM_PROMPT = '''
25
  # Edit Instruction Rewriter
 
93
  NEXT_SCENE_SYSTEM_PROMPT = '''
94
  # Next Scene Prompt Generator
95
  You are a cinematic AI director assistant. Your task is to analyze the provided image and generate a compelling "Next Scene" prompt that describes the natural cinematic progression from the current frame.
 
96
  ## Core Principles:
97
  - Think like a film director: Consider camera dynamics, visual composition, and narrative continuity
98
  - Create prompts that flow seamlessly from the current frame
99
  - Focus on **visual progression** rather than static modifications
100
  - Maintain compositional coherence while introducing organic transitions
 
101
  ## Prompt Structure:
102
  Always begin with "Next Scene: " followed by your cinematic description.
 
103
  ## Key Elements to Include:
104
  1. **Camera Movement**: Specify one of these or combinations:
105
  - Dolly shots (camera moves toward/away from subject)
 
108
  - Pan left/right
109
  - Tilt up/down
110
  - Zoom in/out
 
111
  2. **Framing Evolution**: Describe how the shot composition changes:
112
  - Wide to close-up transitions
113
  - Angle shifts (high angle to eye level, etc.)
114
  - Reframing of subjects
115
  - Revealing new elements in frame
 
116
  3. **Environmental Reveals** (if applicable):
117
  - New characters entering frame
118
  - Expanded scenery
119
  - Spatial progression
120
  - Background elements becoming visible
 
121
  4. **Atmospheric Shifts** (if enhancing the scene):
122
  - Lighting changes (golden hour, shadows, lens flare)
123
  - Weather evolution
124
  - Time-of-day transitions
125
  - Depth and mood indicators
 
126
  ## Guidelines:
127
  - Keep descriptions concise but vivid (2-3 sentences max)
128
  - Always specify the camera action first
129
  - Focus on what changes between this frame and the next
130
  - Maintain the scene's existing style and mood unless intentionally transitioning
131
  - Prefer natural, organic progressions over abrupt changes
 
132
  ## Example Outputs:
133
  - "Next Scene: The camera pulls back from a tight close-up on the airship to a sweeping aerial view, revealing an entire fleet of vessels soaring through a fantasy landscape."
134
  - "Next Scene: The camera tracks forward and tilts down, bringing the sun and helicopters closer into frame as a strong lens flare intensifies."
135
  - "Next Scene: The camera pans right, removing the dragon and rider from view while revealing more of the floating mountain range in the distance."
136
  - "Next Scene: The camera moves slightly forward as sunlight breaks through the clouds, casting a soft glow around the character's silhouette in the mist. Realistic cinematic style, atmospheric depth."
 
137
  ## Output Format:
138
  Return ONLY the next scene prompt as plain text, starting with "Next Scene: "
139
  Do NOT include JSON formatting or additional explanations.
 
178
  result = completion.choices[0].message.content
179
 
180
  # Try to extract JSON if present
181
+ if '{"Rewritten"' in result:
182
  try:
183
+ # Clean up the response
184
+ result = result.replace('```json', '').replace('```', '')
185
+ result_json = json.loads(result)
186
+ polished_prompt = result_json.get('Rewritten', result)
187
+ except:
 
 
 
188
  polished_prompt = result
189
  else:
190
  polished_prompt = result
191
 
192
  polished_prompt = polished_prompt.strip().replace("\n", " ")
 
193
  return polished_prompt
194
 
195
  except Exception as e:
196
  print(f"Error during API call to Hugging Face: {e}")
197
+ # Fallback to original prompt if enhancement fails
198
  return prompt
199
+
200
def _image_to_data_url(img):
    """Return a base64 ``data:`` URL for *img*, or ``None`` if its type is unsupported.

    Objects exposing ``save`` (duck-typed PIL images) are re-encoded as PNG.
    Strings are treated as file paths and their bytes are embedded unchanged.
    Anything else is reported on stdout and skipped.
    """
    if hasattr(img, "save"):
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        payload = buffered.getvalue()
        mime_type = "image/png"
    elif isinstance(img, str):
        import mimetypes  # local import: only needed for file-path inputs

        with open(img, "rb") as image_file:
            payload = image_file.read()
        # Label the bytes with the file's real type rather than always claiming
        # PNG; fall back to PNG when the extension is unknown or not an image.
        guessed_type = mimetypes.guess_type(img)[0]
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/png"
    else:
        print(f"Warning: Unexpected image type: {type(img)}, skipping...")
        return None

    encoded = base64.b64encode(payload).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def _extract_rewritten(result):
    """Return the ``"Rewritten"`` field of a JSON reply, else the reply text itself.

    Markdown code fences are stripped before parsing. If the text is not valid
    JSON, is not a JSON object, or the field is not a string, the fence-stripped
    text is returned instead.
    """
    if '"Rewritten"' not in result:
        return result

    cleaned = result.replace('```json', '').replace('```', '')
    try:
        parsed = json.loads(cleaned)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return cleaned

    if isinstance(parsed, dict):
        rewritten = parsed.get('Rewritten', cleaned)
        if isinstance(rewritten, str):
            return rewritten
    return cleaned


def next_scene_prompt(original_prompt, img_list):
    """Generate a "Next Scene" prompt for the given image(s) via Hugging Face inference.

    The text of ``NEXT_SCENE_SYSTEM_PROMPT`` is sent as the user message,
    followed by every usable image encoded as a base64 data URL, to the
    ``Qwen/Qwen2.5-VL-72B-Instruct`` model through the ``nebius`` provider.

    Args:
        original_prompt: Value returned unchanged when ``HF_TOKEN`` is not set
            or when the API call fails for any reason.
        img_list: A single image, a list of images, or ``None``. Each image may
            be a PIL-style object (anything with ``save``) or a file path
            string; other types are skipped with a warning.

    Returns:
        The model's reply as a single line (newlines replaced by spaces), or
        ``original_prompt`` on any failure.
    """
    api_key = os.environ.get("HF_TOKEN")
    if not api_key:
        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
        return original_prompt

    prompt = f"{NEXT_SCENE_SYSTEM_PROMPT}"
    system_prompt = "you are a helpful assistant, you should provide useful answers to users."
    try:
        client = InferenceClient(
            provider="nebius",
            api_key=api_key,
        )

        # Accept a single image as well as a list of images.
        if img_list is None:
            img_list = []
        elif not isinstance(img_list, list):
            img_list = [img_list]

        # Text first, then one image_url entry per usable image.
        content = [{"type": "text", "text": prompt}]
        for img in img_list:
            image_url = _image_to_data_url(img)
            if image_url:
                content.append({
                    "type": "image_url",
                    "image_url": {"url": image_url},
                })

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ]

        completion = client.chat.completions.create(
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=messages,
        )

        result = completion.choices[0].message.content
        polished_prompt = _extract_rewritten(result)
        return polished_prompt.strip().replace("\n", " ")

    except Exception as e:
        # Best-effort feature: any client/network/response problem falls back
        # to the caller's prompt instead of breaking the UI.
        print(f"Error during API call to Hugging Face: {e}")
        return original_prompt
300
 
301
+
302
+
303
def encode_image(pil_image):
    """Return *pil_image* serialized as PNG and base64-encoded as a text string.

    Args:
        pil_image: Object exposing ``save(fp, format=...)`` (e.g. a PIL image).

    Returns:
        The base64 text of the PNG bytes, without a ``data:`` URL prefix.
    """
    # Reuse the module-level BytesIO import instead of importing io per call.
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  # --- Model Loading ---
310
  dtype = torch.bfloat16
311
  device = "cuda" if torch.cuda.is_available() else "cpu"
312
 
313
+ pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509",
314
+ transformer= QwenImageTransformer2DModel.from_pretrained("linoyts/Qwen-Image-Edit-Rapid-AIO",
315
+ subfolder='transformer',
316
+ torch_dtype=dtype,
317
+ device_map='cuda'),torch_dtype=dtype).to(device)
318
+
319
+ pipe.load_lora_weights(
320
+ "lovis93/next-scene-qwen-image-lora-2509",
321
+ weight_name="next-scene_lora-v2-3000.safetensors", adapter_name="next-scene"
322
+ )
323
+ pipe.set_adapters(["next-scene"], adapter_weights=[1.])
324
+ pipe.fuse_lora(adapter_names=["next-scene"], lora_scale=1.)
325
+ pipe.unload_lora_weights()
326
+
327
+
328
+ # Apply the same optimizations from the first version
329
  pipe.transformer.__class__ = QwenImageTransformer2DModel
330
  pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
331
 
332
  # --- Ahead-of-time compilation ---
333
+ optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB", (1024, 1024))], prompt="prompt")
334
 
335
+ # --- UI Constants and Helpers ---
336
  MAX_SEED = np.iinfo(np.int32).max
337
 
 
338
  def use_output_as_input(output_images):
339
+ """Convert output images to input format for the gallery"""
340
+ if output_images is None or len(output_images) == 0:
341
+ return []
342
+ return output_images
343
+
344
def suggest_next_scene_prompt(images):
    """Build a "Next Scene" prompt suggestion from the input gallery's images.

    Each gallery entry is resolved to an RGB PIL image and the collected images
    are passed to ``next_scene_prompt``. Entries that cannot be loaded are
    logged and skipped.

    Args:
        images: Gallery value, or ``None``. Entries may be tuples/lists whose
            first element is a PIL image or a file path, file-like objects
            exposing ``name``, or a bare PIL image / path string.

    Returns:
        The suggested prompt, or ``""`` when no image could be loaded.
    """
    pil_images = []
    for item in (images if images is not None else []):
        # Resolve the image source *before* indexing: non-subscriptable
        # file-like entries used to raise on ``item[0]`` and were dropped.
        if isinstance(item, (tuple, list)):
            source = item[0] if item else None
        elif hasattr(item, "name"):
            source = item.name
        else:
            source = item

        try:
            if isinstance(source, Image.Image):
                pil_images.append(source.convert("RGB"))
            elif isinstance(source, str):
                pil_images.append(Image.open(source).convert("RGB"))
            else:
                print(f"Skipping unsupported gallery item: {type(source)}")
        except Exception as e:
            # Best-effort: one unreadable image must not block the suggestion.
            print(f"Error processing image: {e}")
            continue

    prompt = next_scene_prompt("", pil_images) if pil_images else ""
    print("next scene prompt: ", prompt)
    return prompt
363
+
364
+ # --- Main Inference Function (with hardcoded negative prompt) ---
365
+ @spaces.GPU(duration=300)
 
 
 
 
 
 
366
  def infer(
367
+ images,
368
+ prompt,
369
+ seed=42,
370
+ randomize_seed=False,
371
+ true_guidance_scale=1.0,
372
+ num_inference_steps=4,
373
  height=None,
374
  width=None,
375
+ rewrite_prompt=True,
376
+ num_images_per_prompt=1,
377
+ progress=gr.Progress(track_tqdm=True),
378
  ):
379
+ """
380
+ Generates an image using the local Qwen-Image diffusers pipeline.
381
+ """
382
+ # Hardcode the negative prompt as requested
383
  negative_prompt = " "
384
 
385
  if randomize_seed:
 
469
  result = gr.Gallery(label="Result", show_label=False, type="pil")
470
  # Add this button right after the result gallery - initially hidden
471
  use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
  with gr.Row():
474
  prompt = gr.Text(
 
531
 
532
  # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
533
 
 
534
  gr.on(
535
  triggers=[run_button.click, prompt.submit],
536
  fn=infer,
 
545
  width,
546
  rewrite_prompt,
547
  ],
548
+ outputs=[result, seed, use_output_btn], # Added use_output_btn to outputs
 
 
 
 
 
549
  )
550
 
551
+ # Add the new event handler for the "Use Output as Input" button
552
  use_output_btn.click(
553
  fn=use_output_as_input,
554
  inputs=[result],
555
  outputs=[input_images]
556
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
558
  input_images.change(fn=suggest_next_scene_prompt, inputs=[input_images], outputs=[prompt])
559