Aduc-sdr-2_5s

Paused

App Files Files Community

euIaxs22 committed on Oct 2

Commit

318739d

verified ·

1 Parent(s): 70c3e5d

Update app_ltx.py

Browse files

Files changed (1) hide show

app_ltx.py +72 -133

app_ltx.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
 import numpy as np
 import random
 import os
@@ -14,42 +14,27 @@ import shutil
 import sys
 # --- SETUP INICIAL: GARANTIR QUE A BIBLIOTECA LTX-VIDEO ESTEJA ACESSÍVEL ---
-# O Dockerfile deve clonar o repositório para /opt/LTX-Video
-LTX_REPO_PATH = Path("/data/LTX-Video")
-if not LTX_REPO_PATH.exists():
-    # Fallback se o Dockerfile não clonou, tenta clonar agora.
-    print(f"Diretório {LTX_REPO_PATH} não encontrado. Tentando clonar...")
-    try:
-        subprocess.run(["git", "clone", "--depth", "1", "https://github.com/Lightricks/LTX-Video", str(LTX_REPO_PATH)], check=True)
-    except Exception as e:
-        print(f"ERRO FATAL: Falha ao clonar o repositório LTX-Video. {e}")
-        raise
-if str(LTX_REPO_PATH) not in sys.path:
-    # Adiciona o diretório clonado ao sys.path para permitir os imports
     sys.path.insert(0, str(LTX_REPO_PATH))
     print(f"Adicionado '{LTX_REPO_PATH}' ao sys.path.")
-# Agora, importa as funções e classes do repositório LTX-Video
 try:
-    from ltx_video.inference import (
-        create_ltx_video_pipeline,
-        create_latent_upsampler,
-        seed_everething,
-        calculate_padding,
-    )
-    from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
     from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
-    from diffusers.utils import export_to_video, load_image, load_video
-    from ltx_video.pipelines.pipeline_ltx_condition import LTXVideoCondition
 except ImportError as e:
-    print(f"ERRO FATAL: Falha ao importar módulos do LTX-Video. Verifique a instalação do repositório. Erro: {e}")
     raise
 # --- CARREGAMENTO GLOBAL DOS MODELOS E CONFIGURAÇÕES ---
 APP_HOME = Path(os.environ.get("APP_HOME", "/app"))
 CONFIG_FILE_PATH = APP_HOME / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
 MODELS_DIR = Path("/data/ltx_models_official")
@@ -62,153 +47,107 @@ with open(CONFIG_FILE_PATH, "r") as file:
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float16
-# --- Baixa os modelos necessários (idempotente) ---
-print(f"Verificando e baixando arquivos de modelo para '{MODELS_DIR}'...")
-# 1. Baixa os arquivos de pesos principais
 for key in ["checkpoint_path", "spatial_upscaler_model_path"]:
     filename = PIPELINE_CONFIG_YAML.get(key)
     if filename and not (MODELS_DIR / filename).exists():
-        print(f"Baixando {filename}...")
         hf_hub_download(repo_id="Lightricks/LTX-Video", filename=filename, local_dir=str(MODELS_DIR), token=os.getenv("HF_TOKEN"))
-# 2. Baixa os componentes de apoio (VAE, Text Encoder, etc.)
-snapshot_download(repo_id="Lightricks/LTX-Video", local_dir=str(MODELS_DIR),
-                  allow_patterns=["text_encoder/*", "tokenizer/*", "vae/*", "scheduler/*"],
-                  token=os.getenv("HF_TOKEN"))
-print("Arquivos de modelo verificados/baixados.")
-# --- Monta as Pipelines (uma única vez, mantendo-as "quentes") ---
-print("Montando pipelines LTX-Video na memória...")
-# Modifica a config em memória para usar o caminho local dos componentes
-PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"] = str(MODELS_DIR)
-pipeline_instance = create_ltx_video_pipeline(
-    ckpt_path=str(MODELS_DIR / PIPELINE_CONFIG_YAML["checkpoint_path"]),
-    precision=PIPELINE_CONFIG_YAML["precision"],
-    text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
-    sampler=PIPELINE_CONFIG_YAML["sampler"],
-    device="cpu", # Carrega na CPU primeiro para economizar VRAM durante a inicialização
-)
-latent_upsampler_instance = create_latent_upsampler(
-    latent_upsampler_model_path=str(MODELS_DIR / PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]),
-    device="cpu"
 )
-print(f"Movendo pipelines para o dispositivo: {DEVICE}...")
-pipeline_instance.to(DEVICE)
-latent_upsampler_instance.to(DEVICE)
 pipeline_instance.vae.enable_tiling()
-print("✅ Pipelines montadas e prontas na GPU.")
-# --- FUNÇÃO DE GERAÇÃO PRINCIPAL (CALLBACK DO GRADIO) ---
-def round_to_nearest_resolution(height, width):
-    ratio = pipeline_instance.vae.spatial_compression_ratio
-    height = height - (height % ratio)
-    width = width - (width % ratio)
-    return int(height), int(width)
 def generate(
-    prompt: str,
-    image_input: Optional[str],
-    target_height: int,
-    target_width: int,
-    num_frames: int,
-    seed: int,
-    guidance_scale: float,
-    num_inference_steps: int,
-    denoise_strength: float,
     progress=gr.Progress(track_tqdm=True)
 ):
-    if not image_input and not prompt:
-        raise gr.Error("Por favor, forneça uma imagem de entrada ou um prompt de texto.")
     seed_everething(seed)
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
-    conditions = None
     if image_input:
         progress(0.1, desc="Preparando imagem de condição...")
-        image = load_image(image_input)
-        video_condition_input = load_video(export_to_video([image]))
-        condition = ConditioningItem(video_condition_input.to(DEVICE), 0, 1.0)
-        conditions = [condition]
-    # --- LÓGICA MULTI-ESCALA ---
     multi_scale_pipeline = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
-    # Prepara os argumentos com base no YAML e na UI
-    first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
-    second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
-    # Sobrescreve com os valores da UI onde faz sentido
-    # Se o YAML tiver uma lista para guidance_scale, respeitamos isso. Se não, usamos o valor da UI.
-    if not isinstance(first_pass_args.get("guidance_scale"), list):
-        first_pass_args["guidance_scale"] = guidance_scale
-    if not isinstance(second_pass_args.get("guidance_scale"), list):
-        second_pass_args["guidance_scale"] = guidance_scale
-    first_pass_args["num_inference_steps"] = num_inference_steps
-    second_pass_args["denoise_strength"] = denoise_strength
     call_kwargs = {
-        "prompt": prompt,
-        "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
         "height": target_height, "width": target_width, "num_frames": num_frames,
         "generator": generator, "output_type": "pt",
-        "conditioning_items": conditions,
-        "decode_timestep": PIPELINE_CONFIG_YAML["decode_timestep"],
-        "decode_noise_scale": PIPELINE_CONFIG_YAML["decode_noise_scale"],
-        "downscale_factor": PIPELINE_CONFIG_YAML["downscale_factor"],
-        "first_pass": first_pass_args,
-        "second_pass": second_pass_args,
     }
-    print("[LTX App] Executando pipeline multi-escala...")
-    progress(0.3, desc="Gerando vídeo (pode levar alguns minutos)...")
     result_tensor = multi_scale_pipeline(**call_kwargs).images
-    # --- ETAPA FINAL: Exportar para vídeo ---
-    progress(0.9, desc="Exportando para arquivo de vídeo...")
     output_video_path = tempfile.mktemp(suffix=".mp4")
     video_np = result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = np.clip(video_np * 255, 0, 255).astype("uint8")
     export_to_video(video_np, str(output_video_path), fps=24)
-    print(f"Vídeo gerado com sucesso em: {output_video_path}")
     return output_video_path
 # --- UI GRADIO ---
-with gr.Blocks(title="LTX-Video (Correto)", theme=gr.themes.Soft()) as demo:
-    gr.HTML("<h1>LTX-Video - Geração de Vídeo Multi-Scale (FP8)</h1><p>Implementação final usando a API nativa do LTX-Video.</p>")
     with gr.Row():
         with gr.Column(scale=1):
-            image_in = gr.Image(type="filepath", label="Imagem de Entrada (Opcional para txt2vid)")
-            prompt_in = gr.Textbox(label="Prompt", lines=4, placeholder="Ex: a cinematic shot of a majestic lion walking in the savanna, 4k, high quality")
-            with gr.Accordion("Parâmetros Principais", open=True):
-                with gr.Row():
-                    height_in = gr.Slider(label="Altura Final (Height)", minimum=256, maximum=1024, step=32, value=480)
-                    width_in = gr.Slider(label="Largura Final (Width)", minimum=256, maximum=1280, step=32, value=832)
-                with gr.Row():
-                    frames_in = gr.Slider(label="Número de Frames", minimum=17, maximum=161, step=8, value=97, info="Deve ser um múltiplo de 8 + 1.")
-                    seed_in = gr.Number(label="Seed", value=42, precision=0)
-            with gr.Accordion("Parâmetros Avançados", open=False):
-                 num_inference_steps_in = gr.Slider(label="Passos de Inferência (Etapa 1)", minimum=4, maximum=50, step=1, value=30)
-                 guidance_scale_in = gr.Slider(label="Força do Guia (Guidance)", minimum=1.0, maximum=10.0, step=0.5, value=1.0, info="Para modelos 'distilled', o valor recomendado é 1.0.")
-                 denoise_strength_in = gr.Slider(label="Força do Refinamento (Denoise)", minimum=0.1, maximum=1.0, step=0.05, value=0.5, info="Controla a intensidade da Etapa 3 (refinamento).")
             run_button = gr.Button("Gerar Vídeo", variant="primary")
         with gr.Column(scale=1):
             video_out = gr.Video(label="Vídeo Gerado")
     run_button.click(
         fn=generate,
-        inputs=[prompt_in, image_in, height_in, width_in, frames_in, seed_in, guidance_scale_in, num_inference_steps_in, denoise_strength_in],
         outputs=[video_out],
     )

 import gradio as gr
 import torch
+import spaces
 import numpy as np
 import random
 import os
 import sys
 # --- SETUP INICIAL: GARANTIR QUE A BIBLIOTECA LTX-VIDEO ESTEJA ACESSÍVEL ---
+# The Dockerfile is expected to have cloned and installed the repository at /opt/LTX-Video.
+LTX_REPO_PATH = Path("/opt/LTX-Video")
+# Prepend the checkout to sys.path only when the directory exists and is not
+# already listed. A missing directory is skipped silently here.
+if LTX_REPO_PATH.exists() and str(LTX_REPO_PATH) not in sys.path:
     sys.path.insert(0, str(LTX_REPO_PATH))
     print(f"Adicionado '{LTX_REPO_PATH}' ao sys.path.")
+# ====================================================================
+# <<< IMPORTAÇÕES CORRIGIDAS, EXATAMENTE COMO VOCÊ PEDIU >>>
 try:
+    from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline, ConditioningItem, LTXMultiScalePipeline
+    from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
     from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+    from diffusers.utils import export_to_video, load_image
+    from ltx_video.inference import seed_everething, calculate_padding, load_media_file
 except ImportError as e:
+    print(f"ERRO FATAL: Falha ao importar módulos do LTX-Video. Verifique a instalação. Erro: {e}")
     raise
+# ====================================================================
 # --- CARREGAMENTO GLOBAL DOS MODELOS E CONFIGURAÇÕES ---
 APP_HOME = Path(os.environ.get("APP_HOME", "/app"))
 CONFIG_FILE_PATH = APP_HOME / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
 MODELS_DIR = Path("/data/ltx_models_official")
+# Module-level setup: pick device/dtype, fetch model files, and build the
+# pipeline objects once at import time.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# bfloat16 is used only on CUDA devices that report bf16 support; every other
+# case (including CPU) falls back to float16.
 DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float16
+print(f"Verificando e baixando modelos para '{MODELS_DIR}'...")
+# Download the main weight files, skipping any that already exist under MODELS_DIR.
 for key in ["checkpoint_path", "spatial_upscaler_model_path"]:
     filename = PIPELINE_CONFIG_YAML.get(key)
     if filename and not (MODELS_DIR / filename).exists():
         hf_hub_download(repo_id="Lightricks/LTX-Video", filename=filename, local_dir=str(MODELS_DIR), token=os.getenv("HF_TOKEN"))
+# Download the supporting components: only the text-encoder repository named
+# in the YAML is fetched here, into MODELS_DIR / "text_encoder".
+# NOTE(review): the VAE and scheduler are loaded below from MODELS_DIR / "vae"
+# and MODELS_DIR / "scheduler", but nothing in this section downloads those
+# directories - confirm they are provisioned elsewhere.
+snapshot_download(repo_id=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"], local_dir=str(MODELS_DIR / "text_encoder"), token=os.getenv("HF_TOKEN"))
+print("Modelos verificados/baixados.")
+print("Montando pipelines LTX-Video...")
+# Load the components individually to assemble the pipeline.
+# These imports sit here, after the sys.path setup, rather than at the top of the file.
+from transformers import T5EncoderModel, T5Tokenizer
+from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
+from ltx_video.models.transformers.transformer3d import Transformer3DModel
+from ltx_video.schedulers.rf import RectifiedFlowScheduler
+from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
+# Each model component is moved straight to DEVICE in DTYPE as it is loaded.
+transformer = Transformer3DModel.from_pretrained(str(MODELS_DIR / PIPELINE_CONFIG_YAML["checkpoint_path"])).to(DEVICE, dtype=DTYPE)
+vae = CausalVideoAutoencoder.from_pretrained(str(MODELS_DIR / "vae")).to(DEVICE, dtype=DTYPE)
+# NOTE(review): the tokenizer and text encoder are both read from the root of
+# the downloaded snapshot - confirm the snapshot layout has them there and not
+# in sub-folders.
+text_encoder = T5EncoderModel.from_pretrained(str(MODELS_DIR / "text_encoder")).to(DEVICE, dtype=DTYPE)
+tokenizer = T5Tokenizer.from_pretrained(str(MODELS_DIR / "text_encoder"))
+scheduler = RectifiedFlowScheduler.from_pretrained(str(MODELS_DIR / "scheduler"))
+# NOTE(review): confirm these six keyword arguments satisfy the
+# LTXVideoPipeline constructor of the installed LTX-Video version.
+pipeline_instance = LTXVideoPipeline(
+    vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, transformer=transformer, scheduler=scheduler, patchifier=SymmetricPatchifier(patch_size=1)
 )
+latent_upsampler_instance = LatentUpsampler.from_pretrained(str(MODELS_DIR / PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"])).to(DEVICE, dtype=DTYPE)
 pipeline_instance.vae.enable_tiling()
+# This message is printed unconditionally, even when DEVICE resolved to "cpu".
+print("✅ Pipelines prontas na GPU.")
+# --- FUNÇÃO DE GERAÇÃO PRINCIPAL ---
+@spaces.GPU
 def generate(
+    prompt: str, image_input: Optional[str],
+    target_height: int, target_width: int, num_frames: int, seed: int,
     progress=gr.Progress(track_tqdm=True)
 ):
+    """Generate a video with the multi-scale LTX pipeline and return its path.
+
+    Args:
+        prompt: Text prompt forwarded to the pipeline.
+        image_input: Optional path of an image used as a conditioning item;
+            a falsy value means no conditioning item is passed.
+        target_height: Requested output height in pixels.
+        target_width: Requested output width in pixels.
+        num_frames: Requested number of output frames.
+        seed: Seed for ``seed_everething`` and the torch generator.
+        progress: Gradio progress tracker.
+
+    Returns:
+        Path of the exported ``.mp4`` file.
+    """
     seed_everething(seed)
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
+    # Round the spatial size up to a multiple of 32 and the frame count up to
+    # 8*k + 1. The pipeline runs at the padded size and the result is cropped
+    # back to the requested size below.
+    height_padded = ((target_height - 1) // 32 + 1) * 32
+    width_padded = ((target_width - 1) // 32 + 1) * 32
+    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1
+    padding_values = calculate_padding(target_height, target_width, height_padded, width_padded)
+    conditioning_items = None
     if image_input:
         progress(0.1, desc="Preparando imagem de condição...")
+        # The image is loaded at the requested size and padded with the same
+        # padding values that are cropped off the result later.
+        media_tensor = load_media_file(
+            media_path=image_input, height=target_height, width=target_width,
+            max_frames=1, padding=padding_values, just_crop=True
+        )
+        # NOTE(review): the positional arguments 0 and 1.0 are presumably the
+        # start frame and conditioning strength - confirm against ConditioningItem.
+        conditioning_items = [ConditioningItem(media_tensor.to(DEVICE, dtype=DTYPE), 0, 1.0)]
     multi_scale_pipeline = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
     call_kwargs = {
+        "prompt": prompt, "negative_prompt": "worst quality...",
+        # Fix: pass the padded dimensions. The crop below removes exactly this
+        # padding, so running at the unpadded size would cut into real content.
+        "height": height_padded, "width": width_padded, "num_frames": num_frames_padded,
         "generator": generator, "output_type": "pt",
+        "conditioning_items": conditioning_items,
+        # The YAML entries are spread last, so a key that also appears above
+        # would be overridden by the YAML value.
+        **PIPELINE_CONFIG_YAML
     }
+    progress(0.3, desc="Gerando vídeo...")
     result_tensor = multi_scale_pipeline(**call_kwargs).images
+    # Crop the padded frames and borders back to the requested size.
+    pad_left, pad_right, pad_top, pad_bottom = padding_values
+    # A zero pad must become None: a slice end of -0 would give an empty result.
+    slice_h_end = -pad_bottom if pad_bottom > 0 else None
+    slice_w_end = -pad_right if pad_right > 0 else None
+    result_tensor = result_tensor[:, :, :num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
+    progress(0.9, desc="Exportando vídeo...")
+    # Fix: tempfile.mktemp is deprecated and race-prone. Create the file
+    # atomically and keep it (delete=False) so the caller can serve it.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
+        output_video_path = tmp_file.name
+    # Move the channel axis last for export; assumes result_tensor[0] is
+    # (channels, frames, height, width) with values in [0, 1] - TODO confirm.
     video_np = result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = np.clip(video_np * 255, 0, 255).astype("uint8")
+    # NOTE(review): some diffusers versions rescale ndarray frames by 255
+    # inside export_to_video - confirm that passing uint8 frames is intended.
     export_to_video(video_np, str(output_video_path), fps=24)
     return output_video_path
 # --- UI GRADIO ---
+# Two-column layout: inputs and the run button on the left, the generated
+# video on the right.
+with gr.Blocks(title="LTX-Video (Final)", theme=gr.themes.Soft()) as demo:
+    gr.HTML("<h1>LTX-Video - Geração de Vídeo Multi-Scale (FP8)</h1>")
     with gr.Row():
         with gr.Column(scale=1):
+            # The image is handed to the callback as a file path.
+            image_in = gr.Image(type="filepath", label="Imagem de Entrada (Opcional)")
+            prompt_in = gr.Textbox(label="Prompt", lines=4, placeholder="Ex: a cinematic shot...")
+            with gr.Accordion("Parâmetros", open=True):
+                # Height and width move in steps of 32.
+                height_in = gr.Slider(label="Altura", minimum=256, maximum=1024, step=32, value=480)
+                width_in = gr.Slider(label="Largura", minimum=256, maximum=1280, step=32, value=832)
+                # Starting at 17 with a step of 8 keeps the frame count at 8*k + 1.
+                frames_in = gr.Slider(label="Frames", minimum=17, maximum=161, step=8, value=97)
+                seed_in = gr.Number(label="Seed", value=42, precision=0)
             run_button = gr.Button("Gerar Vídeo", variant="primary")
         with gr.Column(scale=1):
             video_out = gr.Video(label="Vídeo Gerado")
     run_button.click(
         fn=generate,
+        # The order must match generate's positional parameters: prompt,
+        # image_input, target_height, target_width, num_frames, seed.
+        inputs=[prompt_in, image_in, height_in, width_in, frames_in, seed_in],
         outputs=[video_out],
     )