Aduc-sdr-2_5s

Paused

App Files Files Community

x2XcarleX2x committed on Sep 26

Commit

91c93ea

verified ·

1 Parent(s): f84d55e

Update aduc_framework/managers/wan_manager.py

Browse files

Files changed (1) hide show

aduc_framework/managers/wan_manager.py +205 -145

aduc_framework/managers/wan_manager.py CHANGED Viewed

@@ -1,7 +1,10 @@
 # aduc_framework/managers/wan_manager.py
-# WanManager v0.0.1 (beta)
 import os
 import tempfile
 import random
 from typing import List, Any, Optional, Tuple
@@ -10,137 +13,187 @@ import numpy as np
 import torch
 from PIL import Image
 from diffusers import FlowMatchEulerDiscreteScheduler
 from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
 from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
 from diffusers.utils.export_utils import export_to_video
 class WanManager:
     """
-    WanManager v0.0.1 (beta)
-    - image: primeiro item (peso fixo 1.0) -> latente 0
-    - handle: segundo item (se presente) -> latente 4, com handle_weight da lista
-    - last: último item -> último latente, com anchor_weight_last da lista
-    - Mantém LoRA Lightning fundida, FlowMatch Euler, device_map='auto' e contrato i2v.
-    - Fallback: se a pipeline não suportar os novos args, chama a API original sem handle/pesos.
     """
     MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
-    # Dimensões
     MAX_DIMENSION = 832
     MIN_DIMENSION = 480
     DIMENSION_MULTIPLE = 16
     SQUARE_SIZE = 480
-    # Vídeo
     FIXED_FPS = 16
     MIN_FRAMES_MODEL = 8
     MAX_FRAMES_MODEL = 81
     default_negative_prompt = (
-        "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，"
-        "JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，"
-        "手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走,过曝，"
     )
     def __init__(self) -> None:
         print("Loading models into memory. This may take a few minutes...")
-        # Pipeline i2v com dois transformadores (alto/baixo ruído)
-        self.pipe = WanImageToVideoPipeline.from_pretrained(
-            self.MODEL_ID,
-            transformer=WanTransformer3DModel.from_pretrained(
-                "cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers",
-                subfolder="transformer",
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-            ),
-            transformer_2=WanTransformer3DModel.from_pretrained(
-                "cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers",
-                subfolder="transformer_2",
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-            ),
-            torch_dtype=torch.bfloat16,
-        )
-        # Scheduler
-        self.pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(
-            self.pipe.scheduler.config, shift=32.0
         )
-        # LoRA Lightning (fused)
         print("Applying 8-step Lightning LoRA...")
         try:
-            self.pipe.load_lora_weights(
-                "Kijai/WanVideo_comfy",
-                weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-                adapter_name="lightx2v",
-            )
-            kwargs_lora = {"load_into_transformer_2": True}
-            self.pipe.load_lora_weights(
-                "Kijai/WanVideo_comfy",
-                weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-                adapter_name="lightx2v_2",
-                **kwargs_lora,
-            )
             self.pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1.0, 1.0])
             print("Fusing LoRA weights into the main model...")
             self.pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3.0, components=["transformer"])
             self.pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1.0, components=["transformer_2"])
             self.pipe.unload_lora_weights()
-            print("Lightning LoRA successfully fused. Model is ready for fast 8-step generation.")
         except Exception as e:
-            print(f"AVISO: Falha ao carregar/fundir LoRA. A geração pode ser mais lenta. Erro: {e}")
         print("All models loaded. Service is ready.")
-    # ===== Utils =====
     def process_image_for_video(self, image: Image.Image) -> Image.Image:
-        width, height = image.size
-        if width == height:
-            return image.resize((self.SQUARE_SIZE, self.SQUARE_SIZE), Image.Resampling.LANCZOS)
-        aspect_ratio = width / height
-        new_width, new_height = width, height
-        if new_width > self.MAX_DIMENSION or new_height > self.MAX_DIMENSION:
-            scale = (self.MAX_DIMENSION / new_width) if aspect_ratio > 1 else (self.MAX_DIMENSION / new_height)
-            new_width *= scale
-            new_height *= scale
-        if new_width < self.MIN_DIMENSION or new_height < self.MIN_DIMENSION:
-            scale = (self.MIN_DIMENSION / new_height) if aspect_ratio > 1 else (self.MIN_DIMENSION / new_width)
-            new_width *= scale
-            new_height *= scale
-        final_width = int(round(new_width / self.DIMENSION_MULTIPLE) * self.DIMENSION_MULTIPLE)
-        final_height = int(round(new_height / self.DIMENSION_MULTIPLE) * self.DIMENSION_MULTIPLE)
-        final_width = max(final_width, self.MIN_DIMENSION if aspect_ratio < 1 else self.SQUARE_SIZE)
-        final_height = max(final_height, self.MIN_DIMENSION if aspect_ratio > 1 else self.SQUARE_SIZE)
-        return image.resize((final_width, final_height), Image.Resampling.LANCZOS)
-    def resize_and_crop_to_match(self, target_image: Image.Image, reference_image: Image.Image) -> Image.Image:
-        ref_width, ref_height = reference_image.size
-        target_width, target_height = target_image.size
-        scale = max(ref_width / target_width, ref_height / target_height)
-        new_width, new_height = int(target_width * scale), int(target_height * scale)
-        resized = target_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-        left, top = (new_width - ref_width) // 2, (new_height - ref_height) // 2
-        return resized.crop((left, top, left + ref_width, top + ref_height))
-    # ===== API =====
     def generate_video_from_conditions(
         self,
-        images_condition_items: List[List[Any]],  # [[patch(Image), frame(int|str), peso(float)], ...]
         prompt: str,
         negative_prompt: Optional[str],
         duration_seconds: float,
@@ -150,84 +203,91 @@ class WanManager:
         seed: int,
         randomize_seed: bool,
         output_type: str = "np",
-    ) -> Tuple[str, int]:
-        """
-        - Primeiro item: image (peso fixo 1.0) no latente 0.
-        - Segundo item (opcional): handle em latente 4 com peso da lista.
-        - Último item: last no último latente com peso da lista.
-        """
         if not images_condition_items or len(images_condition_items) < 2:
             raise ValueError("Forneça ao menos dois itens (início e fim).")
         items = images_condition_items
-        # image (peso fixo 1.0)
         start_image = items[0][0]
-        # handle (segundo item se houver)
         handle_image = items[1][0] if len(items) >= 3 else None
         handle_weight = float(items[1][2]) if len(items) >= 3 and items[1][2] is not None else 1.0
-        # last (sempre o último item)
-        end_image = items[-1][0]
         end_weight = float(items[-1][2]) if len(items[-1]) >= 3 and items[-1][2] is not None else 1.0
-        if start_image is None or end_image is None:
-            raise ValueError("As imagens inicial e final não podem ser vazias.")
-        if not isinstance(start_image, Image.Image) or not isinstance(end_image, Image.Image):
-            raise TypeError("Os 'patches' devem ser PIL.Image.")
-        if handle_image is not None and not isinstance(handle_image, Image.Image):
-            raise TypeError("O 'patch' do handle deve ser PIL.Image.")
         processed_start = self.process_image_for_video(start_image)
         processed_end = self.resize_and_crop_to_match(end_image, processed_start)
-        processed_handle = self.resize_and_crop_to_match(handle_image, processed_start) if handle_image is not None else None
-        target_height, target_width = processed_start.height, processed_start.width
-        num_frames = int(round(duration_seconds * self.FIXED_FPS))
-        num_frames = int(np.clip(num_frames, self.MIN_FRAMES_MODEL, self.MAX_FRAMES_MODEL))
         current_seed = random.randint(0, np.iinfo(np.int32).max) if randomize_seed else int(seed)
         generator = torch.Generator().manual_seed(current_seed)
         call_kwargs = dict(
-            image=processed_start,              # latente 0 (peso 1.0 implícito)
-            last_image=processed_end,           # último latente (peso ajustável)
-            prompt=prompt,
-            negative_prompt=negative_prompt if negative_prompt is not None else self.default_negative_prompt,
-            height=target_height,
-            width=target_width,
-            num_frames=num_frames,
-            guidance_scale=float(guidance_scale),
-            guidance_scale_2=float(guidance_scale_2),
-            num_inference_steps=int(steps),
-            generator=generator,
-            output_type=output_type,
         )
-        try:
-            if processed_handle is not None:
-                # handle no latente 4 com peso da lista; last no último com end_weight
-                result = self.pipe(
-                    **call_kwargs,
-                    handle_image=processed_handle,
-                    handle_weight=float(handle_weight),
-                    handle_latent_index=25,
-                    anchor_weight_last=float(end_weight),
-                )
-            else:
-                # sem handle; apenas peso do last
-                result = self.pipe(
-                    **call_kwargs,
-                    anchor_weight_last=float(end_weight),
-                )
-        except TypeError:
-            print("[WanManager] handle/anchor args não suportados; usando chamada padrão.")
-            result = self.pipe(**call_kwargs)
         frames = result.frames[0]
         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
             video_path = tmp.name
         export_to_video(frames, video_path, fps=self.FIXED_FPS)
-        return video_path, current_seed

 # aduc_framework/managers/wan_manager.py
+# WanManager v1.0.0 (production-ready)
 import os
+import platform
+import shutil
+import subprocess
 import tempfile
 import random
 from typing import List, Any, Optional, Tuple
 import torch
 from PIL import Image
+# Habilita TF32 para performance em GPUs Ampere+
+torch.backends.cuda.matmul.allow_tf32 = True
+# SDPA / FlashAttention context
+try:
+    from torch.nn.attention import sdpa_kernel, SDPBackend
+    _SDPA_NEW = True
+except Exception:
+    from torch.backends.cuda import sdp_kernel as _legacy_sdp
+    _SDPA_NEW = False
 from diffusers import FlowMatchEulerDiscreteScheduler
 from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
 from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
 from diffusers.utils.export_utils import export_to_video
+from aduc_framework.utils.callbacks import DenoiseStepLogger
 class WanManager:
     """
     Manager for the Wan 2.2 image-to-video pipeline.

     What this class does, as implemented below:

     - Prints an environment banner (Python/PyTorch/CUDA versions, GPUs,
       BF16/TF32 flags, SDPA API flavour, xFormers availability) when an
       instance is constructed.
     - Loads both Wan transformers with ``device_map="auto"`` and a
       per-device ``max_memory`` cap, then builds the pipeline around them.
     - Loads the Lightning LoRA into both transformers and fuses it; a
       failure in that step is reported and otherwise ignored.
     - Normalises the requested frame count to ``k * sf_t + 1`` and snaps the
       optional handle frame to an ``8n + 1`` position clamped to a buffer
       zone at each end of the clip.
     - Registers a ``DenoiseStepLogger`` step callback and, when it captured
       frames, writes a debug video and a grid image of the denoising steps.

     NOTE(review): the module imports the SDPA kernel helpers, but no SDPA
     backend is selected anywhere in this class as visible here — confirm
     whether that is done elsewhere.
     """

     MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
     TRANSFORMER_ID = "cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers"

     # Lightning LoRA checkpoint, loaded once per transformer.
     LORA_REPO_ID = "Kijai/WanVideo_comfy"
     LORA_WEIGHT_NAME = "Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors"

     # Placement caps handed to ``from_pretrained(max_memory=...)``.
     GPU_MAX_MEMORY = "43GiB"
     CPU_MAX_MEMORY = "120GiB"

     MAX_DIMENSION = 832
     MIN_DIMENSION = 480
     DIMENSION_MULTIPLE = 16
     SQUARE_SIZE = 480

     FIXED_FPS = 16
     MIN_FRAMES_MODEL = 8
     MAX_FRAMES_MODEL = 81

     # Handle (control) frame placement: positions have the form 8n + 1.
     HANDLE_FRAME_STRIDE = 8
     DEFAULT_HANDLE_FRAME = 17

     # Default negative prompt (English).
     default_negative_prompt = (
         "bright, overexposed, static, blurry details, text, subtitles, watermark, style, "
         "artwork, painting, still image, gray scale, worst quality, low quality, jpeg artifacts, "
         "ugly, deformed, disfigured, missing fingers, extra fingers, poorly drawn hands, "
         "poorly drawn face, malformed limbs, fused fingers, messy background, three legs, "
         "too many people, walking backwards."
     )

     def __init__(self) -> None:
         """Print the environment banner, load the pipeline and fuse the LoRA."""
         self._print_env_banner()
         print("Loading models into memory. This may take a few minutes...")

         # One cap per visible GPU plus a cap for CPU offload.
         max_memory = {i: self.GPU_MAX_MEMORY for i in range(torch.cuda.device_count())}
         max_memory["cpu"] = self.CPU_MAX_MEMORY

         transformer = WanTransformer3DModel.from_pretrained(
             self.TRANSFORMER_ID, subfolder="transformer", torch_dtype=torch.bfloat16,
             device_map="auto", max_memory=max_memory
         )
         transformer_2 = WanTransformer3DModel.from_pretrained(
             self.TRANSFORMER_ID, subfolder="transformer_2", torch_dtype=torch.bfloat16,
             device_map="auto", max_memory=max_memory
         )
         self.pipe = WanImageToVideoPipeline.from_pretrained(
             self.MODEL_ID, transformer=transformer, transformer_2=transformer_2, torch_dtype=torch.bfloat16
         )
         self.pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.pipe.scheduler.config, shift=32.0)

         print("Applying 8-step Lightning LoRA...")
         try:
             self.pipe.load_lora_weights(self.LORA_REPO_ID, weight_name=self.LORA_WEIGHT_NAME, adapter_name="lightx2v")
             self.pipe.load_lora_weights(
                 self.LORA_REPO_ID, weight_name=self.LORA_WEIGHT_NAME,
                 adapter_name="lightx2v_2", load_into_transformer_2=True
             )
             self.pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1.0, 1.0])
             print("Fusing LoRA weights into the main model...")
             self.pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3.0, components=["transformer"])
             self.pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1.0, components=["transformer_2"])
             self.pipe.unload_lora_weights()
             print("Lightning LoRA successfully fused.")
         except Exception as e:
             # Deliberate best effort: the pipeline stays usable without the LoRA.
             print(f"[WanManager] AVISO: Falha ao fundir LoRA Lightning: {e}")
         print("All models loaded. Service is ready.")

     def _print_env_banner(self) -> None:
         """Print a diagnostic banner describing the runtime environment."""
         def _safe_get(fn, default="n/a"):
             try:
                 return fn()
             except Exception:
                 return default

         torch_ver = getattr(torch, "__version__", "unknown")
         cuda_rt = getattr(torch.version, "cuda", "unknown")
         cudnn_ver = _safe_get(lambda: torch.backends.cudnn.version())
         cuda_ok = torch.cuda.is_available()
         n_gpu = torch.cuda.device_count() if cuda_ok else 0

         devs, total_vram, caps = [], [], []
         for i in range(n_gpu):
             props = torch.cuda.get_device_properties(i)
             devs.append(f"cuda:{i} {props.name}")
             total_vram.append(f"{props.total_memory/1024**3:.1f}GiB")
             caps.append(f"{props.major}.{props.minor}")

         try:
             bf16_supported = torch.cuda.is_bf16_supported()
         except Exception:
             bf16_supported = False
         tf32_allowed = torch.backends.cuda.matmul.allow_tf32

         if _SDPA_NEW:
             sdpa_api = "torch.nn.attention (2.1+)"
         elif hasattr(torch.backends.cuda, 'sdp_kernel'):
             sdpa_api = "torch.backends.cuda (2.0)"
         else:
             sdpa_api = "unavailable"

         try:
             import xformers  # noqa: F401  (imported only to probe availability)
             xformers_ok = True
         except ImportError:
             xformers_ok = False

         alloc_conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "unset")
         visible = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
         python_ver = platform.python_version()

         nvcc = shutil.which("nvcc")
         nvcc_ver = "n/a"
         if nvcc:
             try:
                 # The last line of `nvcc --version` is used as the version string.
                 nvcc_ver = subprocess.check_output([nvcc, "--version"], text=True).strip().splitlines()[-1]
             except Exception:
                 nvcc_ver = "n/a"

         banner_lines = [
             "================== WAN MANAGER • ENV ==================",
             f"Python              : {python_ver}", f"PyTorch             : {torch_ver}",
             f"CUDA (torch)        : {cuda_rt}", f"cuDNN               : {cudnn_ver}",
             f"CUDA available      : {cuda_ok}", f"GPU count           : {n_gpu}",
             f"GPUs                : {', '.join(devs) if devs else 'n/a'}",
             f"GPU VRAM            : {', '.join(total_vram) if total_vram else 'n/a'}",
             f"Compute Capability  : {', '.join(caps) if caps else 'n/a'}",
             f"BF16 supported      : {bf16_supported}", f"TF32 allowed        : {tf32_allowed}",
             f"SDPA API            : {sdpa_api}", f"xFormers available  : {xformers_ok}",
             f"CUDA_VISIBLE_DEVICES: {visible}", f"PYTORCH_CUDA_ALLOC_CONF: {alloc_conf}",
             f"nvcc                : {nvcc_ver}",
             "=======================================================",
         ]
         print("\n".join(banner_lines))

     def _round_multiple(self, x: int, multiple: int) -> int:
         """Return ``x`` rounded to the nearest multiple of ``multiple``."""
         return int(round(x / multiple) * multiple)

     @staticmethod
     def _item_field(item: List[Any], index: int, default: Any = None) -> Any:
         """Return ``item[index]``, or ``default`` when it is missing or None."""
         if len(item) > index and item[index] is not None:
             return item[index]
         return default

     def process_image_for_video(self, image: Image.Image) -> Image.Image:
         """
         Resize ``image`` to a resolution accepted by the model.

         Square images become ``SQUARE_SIZE`` x ``SQUARE_SIZE``. Otherwise the
         longer side is scaled down to ``MAX_DIMENSION`` when it exceeds it,
         the shorter side is scaled up to ``MIN_DIMENSION`` when it falls
         below it, both sides are snapped to ``DIMENSION_MULTIPLE`` and a
         lower bound is applied to each side.
         """
         w, h = image.size
         if w == h:
             return image.resize((self.SQUARE_SIZE, self.SQUARE_SIZE), Image.Resampling.LANCZOS)

         ar = w / h
         nw, nh = w, h
         if nw > self.MAX_DIMENSION or nh > self.MAX_DIMENSION:
             s = (self.MAX_DIMENSION / nw) if ar > 1 else (self.MAX_DIMENSION / nh)
             nw, nh = nw * s, nh * s
         if nw < self.MIN_DIMENSION or nh < self.MIN_DIMENSION:
             s = (self.MIN_DIMENSION / nh) if ar > 1 else (self.MIN_DIMENSION / nw)
             nw, nh = nw * s, nh * s

         fw = self._round_multiple(int(nw), self.DIMENSION_MULTIPLE)
         fh = self._round_multiple(int(nh), self.DIMENSION_MULTIPLE)
         fw = max(fw, self.MIN_DIMENSION if ar < 1 else self.SQUARE_SIZE)
         fh = max(fh, self.MIN_DIMENSION if ar > 1 else self.SQUARE_SIZE)
         return image.resize((fw, fh), Image.Resampling.LANCZOS)

     def resize_and_crop_to_match(self, target: Image.Image, ref: Image.Image) -> Image.Image:
         """Scale ``target`` to cover ``ref``'s size, then center-crop to that size."""
         rw, rh = ref.size
         tw, th = target.size
         s = max(rw / tw, rh / th)
         nw, nh = int(tw * s), int(th * s)
         resized = target.resize((nw, nh), Image.Resampling.LANCZOS)
         left, top = (nw - rw) // 2, (nh - rh) // 2
         return resized.crop((left, top, left + rw, top + rh))

     def _resolve_handle_index(self, handle_frame_ui: int, num_frames: int) -> Optional[int]:
         """
         Snap the requested handle frame to ``8n + 1`` and clamp it to the safe zone.

         Returns None when the clip is too short for the clamped position to
         fall inside it, so the caller can skip handle conditioning.
         """
         stride = self.HANDLE_FRAME_STRIDE
         aligned_frame = round(handle_frame_ui / stride) * stride + 1
         min_safe_frame = stride + 1                # one stride of buffer at the start
         max_safe_frame = num_frames - (stride + 1)  # one stride of buffer at the end
         corrected = max(min_safe_frame, min(aligned_frame, max_safe_frame))
         print(f"[WanManager] INFO: Handle Frame UI {handle_frame_ui} alinhado para {aligned_frame} e validado para {corrected} (limites seguros: {min_safe_frame}-{max_safe_frame}).")
         if corrected >= num_frames:
             # The lower bound wins the clamp on very short clips and would
             # point at or past the end of the video.
             print(f"[WanManager] WARNING: clip has only {num_frames} frames; "
                   f"handle frame {corrected} is out of range, ignoring the handle image.")
             return None
         return corrected

     def generate_video_from_conditions(
         self,
         images_condition_items: List[List[Any]],
         prompt: str,
         negative_prompt: Optional[str],
         duration_seconds: float,
         # NOTE(review): the diff view elides the parameters declared between
         # `duration_seconds` and `seed`; the body below reads `steps`,
         # `guidance_scale` and `guidance_scale_2` — confirm names and order
         # against the full file.
         seed: int,
         randomize_seed: bool,
         output_type: str = "np",
     ) -> Tuple[str, int, Optional[str], Optional[str]]:
         """
         Generate a video from a start image, an end image and an optional handle.

         ``images_condition_items`` holds ``[image, frame, weight]`` entries: the
         first entry is the start image, the last entry is the end image, and
         when at least three entries are given the second one is the handle
         (control) image. A missing or None frame/weight falls back to
         ``DEFAULT_HANDLE_FRAME`` / ``1.0``.

         Returns:
             ``(video_path, seed_used, debug_video_path, grid_image_path)``;
             the two debug paths are None when the step logger captured nothing.

         Raises:
             ValueError: fewer than two items were given, or the start or end
                 image is None.
         """
         if not images_condition_items or len(images_condition_items) < 2:
             raise ValueError("Forneça ao menos dois itens (início e fim).")
         items = images_condition_items

         start_image = items[0][0]
         end_image = items[-1][0]
         if start_image is None or end_image is None:
             raise ValueError("As imagens inicial e final não podem ser vazias.")

         has_handle = len(items) >= 3
         handle_image = items[1][0] if has_handle else None
         handle_weight = float(self._item_field(items[1], 2, 1.0)) if has_handle else 1.0
         end_weight = float(self._item_field(items[-1], 2, 1.0))

         processed_start = self.process_image_for_video(start_image)
         processed_end = self.resize_and_crop_to_match(end_image, processed_start)
         processed_handle = (
             self.resize_and_crop_to_match(handle_image, processed_start)
             if handle_image is not None else None
         )
         H, W = processed_start.height, processed_start.width

         # 1. Compute the total frame count: clamp to the model range, then
         #    align down to k * sf_t + 1 (4n + 1 with the default factor).
         initial_frames = int(round(duration_seconds * self.FIXED_FPS))
         clamped_frames = int(np.clip(initial_frames, self.MIN_FRAMES_MODEL, self.MAX_FRAMES_MODEL))
         sf_t = getattr(self.pipe, "vae_scale_factor_temporal", 4)
         num_frames = ((clamped_frames - 1) // sf_t * sf_t) + 1
         print(f"[WanManager] INFO: Duração {duration_seconds}s => {initial_frames} frames. "
               f"Após clamp e alinhamento 4n+1, o total de frames final é {num_frames}.")

         current_seed = random.randint(0, np.iinfo(np.int32).max) if randomize_seed else int(seed)
         generator = torch.Generator().manual_seed(current_seed)

         denoise_callback = DenoiseStepLogger(self.pipe)

         # 2. Compute and validate the handle (control) frame position.
         corrected_handle_index = None
         if processed_handle is not None:
             handle_frame_ui = int(self._item_field(items[1], 1, self.DEFAULT_HANDLE_FRAME))
             corrected_handle_index = self._resolve_handle_index(handle_frame_ui, num_frames)
             if corrected_handle_index is None:
                 processed_handle = None

         pipe_kwargs = dict(
             image=processed_start, last_image=processed_end, prompt=prompt,
             negative_prompt=negative_prompt or self.default_negative_prompt,
             height=H, width=W, num_frames=num_frames,
             guidance_scale=float(guidance_scale), guidance_scale_2=float(guidance_scale_2),
             num_inference_steps=int(steps), generator=generator, output_type=output_type,
             anchor_weight_last=float(end_weight),
             # The step callback must reach the pipeline, otherwise the logger
             # never sees a step and no debug artefacts are produced.
             callback_on_step_end=denoise_callback,
             callback_on_step_end_tensor_inputs=["latents"],
         )
         if processed_handle is not None:
             pipe_kwargs.update({
                 "handle_image": processed_handle,
                 "handle_weight": float(handle_weight),
                 "handle_frame_index": corrected_handle_index,
             })

         result = self.pipe(**pipe_kwargs)
         frames = result.frames[0]

         # Reserve a path first, then let the exporter write to it.
         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
             video_path = tmp.name
         export_to_video(frames, video_path, fps=self.FIXED_FPS)

         debug_video_path, grid_image_path = None, None
         if denoise_callback.intermediate_frames:
             with tempfile.NamedTemporaryFile(suffix="_denoise_process.mp4", delete=False) as tmp:
                 debug_video_path = tmp.name
             denoise_callback.save_as_video(debug_video_path, fps=max(1, int(steps) // 2))
             grid_pil = denoise_callback.create_steps_grid()
             if grid_pil:
                 with tempfile.NamedTemporaryFile(suffix="_steps_grid.png", delete=False) as tmp:
                     grid_image_path = tmp.name
                 grid_pil.save(grid_image_path)
         return video_path, current_seed, debug_video_path, grid_image_path