Aduc-sdr-2_5s

Paused

App Files Files Community

x2XcarleX2x committed on Sep 26

Commit

348f29f

verified ·

1 Parent(s): 2d3e403

Update aduc_framework/utils/callbacks.py

Browse files

Files changed (1) hide show

aduc_framework/utils/callbacks.py +20 -41

aduc_framework/utils/callbacks.py CHANGED Viewed

@@ -16,40 +16,40 @@ class DenoiseStepLogger:
     def __init__(self, pipe):
         self.pipe = pipe
         self.intermediate_frames = []
-        self.latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(1, pipe.vae.config.z_dim, 1, 1, 1).to(pipe.device)
-        self.latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(1, pipe.vae.config.z_dim, 1, 1, 1).to(pipe.device)
     def decode_latents_to_pil(self, latents: torch.Tensor) -> Image.Image:
         """Decodifica um tensor de latents para uma única imagem PIL."""
-        latents = latents.to(self.pipe.vae.dtype)
-        latents = latents / self.latents_std + self.latents_mean
-        decoded_video_tensor = self.pipe.vae.decode(latents, return_dict=False)[0]
-        # Pega o primeiro frame do lote de vídeo decodificado
-        frame_tensor = decoded_video_tensor[0, :, 0, :, :]
-        # Normaliza o tensor de [ -1, 1] para [0, 1]
-        frame_tensor = (frame_tensor / 2 + 0.5).clamp(0, 1)
-        # Converte para array NumPy e depois para imagem PIL
         frame_np = frame_tensor.cpu().permute(1, 2, 0).float().numpy()
         pil_image = Image.fromarray((frame_np * 255).astype(np.uint8))
         return pil_image
     def __call__(self, pipe, step: int, timestep: int, callback_kwargs: dict):
         """
         Esta função é chamada pela pipeline da diffusers em cada passo de denoising.
-        A assinatura está corrigida para aceitar os 5 argumentos padrão.
         """
         print(f"  -> Callback: Capturando frame do passo de denoising {step+1}...")
-        # Extrai o tensor de latents do dicionário `callback_kwargs`
         latents = callback_kwargs["latents"]
         pil_frame = self.decode_latents_to_pil(latents)
         self.intermediate_frames.append(pil_frame)
-        # É uma boa prática retornar o dicionário para a pipeline
         return callback_kwargs
     def save_as_video(self, output_path: str, fps: int = 5):
@@ -57,14 +57,10 @@ class DenoiseStepLogger:
         if not self.intermediate_frames:
             print("  -> Callback: Nenhum frame intermediário para salvar como vídeo.")
             return
         print(f"  -> Callback: Codificando {len(self.intermediate_frames)} frames em vídeo em '{output_path}'...")
-        # Usa um codec de alta compatibilidade e boa qualidade
         writer = imageio.get_writer(output_path, fps=fps, codec='libx264', quality=8, pixelformat='yuv420p')
         for frame in self.intermediate_frames:
             writer.append_data(np.array(frame))
         writer.close()
         print("  -> Callback: Vídeo de depuração salvo com sucesso.")
@@ -75,51 +71,34 @@ class DenoiseStepLogger:
         if not self.intermediate_frames:
             print("  -> Callback: Nenhum frame intermediário para criar a grade.")
             return None
         print(f"  -> Callback: Criando grade de comparação com {len(self.intermediate_frames)} etapas...")
-        # Calcula um layout de grade agradável (o mais quadrado possível)
         num_images = len(self.intermediate_frames)
         cols = math.ceil(math.sqrt(num_images))
         rows = math.ceil(num_images / cols)
         frame_w, frame_h = self.intermediate_frames[0].size
-        grid_w = frame_w * cols
-        grid_h = frame_h * rows
         grid_image = Image.new('RGB', (grid_w, grid_h), (20, 20, 20))
         draw = ImageDraw.Draw(grid_image)
-        # Tenta carregar uma fonte, usa uma padrão se falhar
         try:
-            # Em muitos sistemas Linux/Docker, esta fonte estará disponível
             font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
             if not os.path.exists(font_path):
-                # Fallback para um caminho comum em contêineres Debian
                 font_path = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
             font = ImageFont.truetype(font_path, size=32)
         except IOError:
             print("  -> Callback WARNING: Fonte não encontrada. Usando fonte padrão.")
             font = ImageFont.load_default()
-        # Cola cada frame na grade e desenha a legenda
         for i, frame in enumerate(self.intermediate_frames):
-            x = (i % cols) * frame_w
-            y = (i // cols) * frame_h
             grid_image.paste(frame, (x, y))
             text = f"Passo {i+1}"
             text_origin = (x + 10, y + 10)
             try:
                 text_bbox = draw.textbbox(text_origin, text, font=font)
-            except AttributeError: # Fallback para Pillow < 9.2.0
                 text_w, text_h = draw.textsize(text, font=font)
                 text_bbox = (text_origin[0], text_origin[1], text_origin[0] + text_w, text_origin[1] + text_h)
             rect_coords = (text_bbox[0] - 5, text_bbox[1] - 5, text_bbox[2] + 5, text_bbox[3] + 5)
-            draw.rectangle(rect_coords, fill=(0, 0, 0, 180)) # Fundo preto semi-transparente
             draw.text(text_origin, text, font=font, fill=(255, 255, 255))
         print("  -> Callback: Grade de comparação criada com sucesso.")
         return grid_image

     def __init__(self, pipe):
         """Store the pipeline and precompute the tensors used to un-normalize latents.

         Args:
             pipe: Pipeline object; this method reads ``pipe.vae.config.latents_mean``,
                 ``pipe.vae.config.latents_std`` and ``pipe.vae.config.z_dim`` from it.
         """
         self.pipe = pipe
         # PIL frames appended by __call__, one per invocation.
         self.intermediate_frames = []
+        # Keep the tensors on the CPU by default during initialization.
+        # They are moved to the correct device at the moment of use.
+        self.latents_mean = torch.tensor(pipe.vae.config.latents_mean).view(1, pipe.vae.config.z_dim, 1, 1, 1)
         # NOTE: despite its name, this attribute holds the reciprocal (1 / latents_std);
         # decode_latents_to_pil divides by it, which multiplies by the configured std.
+        self.latents_std = 1.0 / torch.tensor(pipe.vae.config.latents_std).view(1, pipe.vae.config.z_dim, 1, 1, 1)
+    ### START OF FIXED SECTION ###
     def decode_latents_to_pil(self, latents: torch.Tensor) -> Image.Image:
         """Decode a latents tensor into a single PIL image.

         The latents are un-normalized with the stored mean / reciprocal-std
         tensors, decoded with ``self.pipe.vae.decode``, and one 3-axis slice
         of the decoded tensor is converted to an 8-bit image.

         Args:
             latents: Latents tensor to decode. Assumed to broadcast against the
                 (1, z_dim, 1, 1, 1) statistics tensors — TODO confirm layout.

         Returns:
             The PIL image built from the selected slice of the decoded tensor.
         """
+        # Take the correct device from the input `latents` tensor
+        correct_device = latents.device
+        # Move the mean and standard-deviation tensors to the same device as the latents
+        # before performing the operation. This avoids the "device meta" error.
+        latents_unscaled = latents / self.latents_std.to(correct_device) + self.latents_mean.to(correct_device)
         # Cast to the VAE's dtype before decoding.
+        latents_unscaled = latents_unscaled.to(self.pipe.vae.dtype)
+        decoded_video_tensor = self.pipe.vae.decode(latents_unscaled, return_dict=False)[0]
         # Keep index 0 of the first and third axes — presumably the first frame of the
         # first video in the batch (assumes batch, channels, frames, H, W — TODO confirm).
+        frame_tensor = decoded_video_tensor[0, :, 0, :, :]
         # Rescale as x / 2 + 0.5 and clamp to [0, 1] (assumes decoder output is nominally in [-1, 1]).
+        frame_tensor = (frame_tensor / 2 + 0.5).clamp(0, 1)
         # Move to CPU and put the leading axis last (presumably channels-first -> channels-last).
         frame_np = frame_tensor.cpu().permute(1, 2, 0).float().numpy()
         # Scale to 0-255 and cast to uint8 for PIL.
         pil_image = Image.fromarray((frame_np * 255).astype(np.uint8))
         return pil_image
+    ### END OF FIXED SECTION ###
     def __call__(self, pipe, step: int, timestep: int, callback_kwargs: dict):
         """
         This function is called by the diffusers pipeline at each denoising step.

         It decodes ``callback_kwargs["latents"]`` into a PIL image and appends it
         to ``self.intermediate_frames``.

         Args:
             pipe: Pipeline passed by the caller; not used in this method.
             step: Zero-based step index; only used in the log message (printed as step + 1).
             timestep: Current timestep; not used in this method.
             callback_kwargs: Dictionary that must contain a "latents" entry.

         Returns:
             The same ``callback_kwargs`` dictionary, unmodified.
         """
         print(f"  -> Callback: Capturando frame do passo de denoising {step+1}...")
         latents = callback_kwargs["latents"]
         pil_frame = self.decode_latents_to_pil(latents)
         self.intermediate_frames.append(pil_frame)
         # Hand the dictionary back to the pipeline unchanged.
         return callback_kwargs
     def save_as_video(self, output_path: str, fps: int = 5):
         """Encode the collected intermediate frames into a video file.

         Args:
             output_path: Destination path handed to ``imageio.get_writer``.
             fps: Frame rate handed to the writer (default 5).

         Returns:
             None. Returns early, writing nothing, when no frames were collected.
         """
         if not self.intermediate_frames:
             print("  -> Callback: Nenhum frame intermediário para salvar como vídeo.")
             return
         print(f"  -> Callback: Codificando {len(self.intermediate_frames)} frames em vídeo em '{output_path}'...")
         # H.264 writer with yuv420p pixel format and quality=8.
         writer = imageio.get_writer(output_path, fps=fps, codec='libx264', quality=8, pixelformat='yuv420p')
         # NOTE(review): the writer is not closed if append_data raises; consider try/finally.
         for frame in self.intermediate_frames:
             writer.append_data(np.array(frame))
         writer.close()
         print("  -> Callback: Vídeo de depuração salvo com sucesso.")
         if not self.intermediate_frames:
             print("  -> Callback: Nenhum frame intermediário para criar a grade.")
             return None
         print(f"  -> Callback: Criando grade de comparação com {len(self.intermediate_frames)} etapas...")
         num_images = len(self.intermediate_frames)
         cols = math.ceil(math.sqrt(num_images))
         rows = math.ceil(num_images / cols)
         frame_w, frame_h = self.intermediate_frames[0].size
+        grid_w, grid_h = frame_w * cols, frame_h * rows
         grid_image = Image.new('RGB', (grid_w, grid_h), (20, 20, 20))
         draw = ImageDraw.Draw(grid_image)
         try:
             font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
             if not os.path.exists(font_path):
                 font_path = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
             font = ImageFont.truetype(font_path, size=32)
         except IOError:
             print("  -> Callback WARNING: Fonte não encontrada. Usando fonte padrão.")
             font = ImageFont.load_default()
         for i, frame in enumerate(self.intermediate_frames):
+            x, y = (i % cols) * frame_w, (i // cols) * frame_h
             grid_image.paste(frame, (x, y))
             text = f"Passo {i+1}"
             text_origin = (x + 10, y + 10)
             try:
                 text_bbox = draw.textbbox(text_origin, text, font=font)
+            except AttributeError:
                 text_w, text_h = draw.textsize(text, font=font)
                 text_bbox = (text_origin[0], text_origin[1], text_origin[0] + text_w, text_origin[1] + text_h)
             rect_coords = (text_bbox[0] - 5, text_bbox[1] - 5, text_bbox[2] + 5, text_bbox[3] + 5)
+            draw.rectangle(rect_coords, fill=(0, 0, 0, 180))
             draw.text(text_origin, text, font=font, fill=(255, 255, 255))
         print("  -> Callback: Grade de comparação criada com sucesso.")
         return grid_image