euIaxs22 committed on
Commit
318739d
·
verified ·
1 Parent(s): 70c3e5d

Update app_ltx.py

Browse files
Files changed (1) hide show
  1. app_ltx.py +72 -133
app_ltx.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import torch
3
-
4
  import numpy as np
5
  import random
6
  import os
@@ -14,42 +14,27 @@ import shutil
14
  import sys
15
 
16
  # --- SETUP INICIAL: GARANTIR QUE A BIBLIOTECA LTX-VIDEO ESTEJA ACESSÍVEL ---
17
-
18
- # O Dockerfile deve clonar o repositório para /opt/LTX-Video
19
- LTX_REPO_PATH = Path("/data/LTX-Video")
20
- if not LTX_REPO_PATH.exists():
21
- # Fallback se o Dockerfile não clonou, tenta clonar agora.
22
- print(f"Diretório {LTX_REPO_PATH} não encontrado. Tentando clonar...")
23
- try:
24
- subprocess.run(["git", "clone", "--depth", "1", "https://github.com/Lightricks/LTX-Video", str(LTX_REPO_PATH)], check=True)
25
- except Exception as e:
26
- print(f"ERRO FATAL: Falha ao clonar o repositório LTX-Video. {e}")
27
- raise
28
-
29
- if str(LTX_REPO_PATH) not in sys.path:
30
- # Adiciona o diretório clonado ao sys.path para permitir os imports
31
  sys.path.insert(0, str(LTX_REPO_PATH))
32
  print(f"Adicionado '{LTX_REPO_PATH}' ao sys.path.")
33
 
34
- # Agora, importa as funções e classes do repositório LTX-Video
 
35
  try:
36
- from ltx_video.inference import (
37
- create_ltx_video_pipeline,
38
- create_latent_upsampler,
39
- seed_everething,
40
- calculate_padding,
41
- )
42
- from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
43
  from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
44
- from diffusers.utils import export_to_video, load_image, load_video
45
- from ltx_video.pipelines.pipeline_ltx_condition import LTXVideoCondition
46
  except ImportError as e:
47
- print(f"ERRO FATAL: Falha ao importar módulos do LTX-Video. Verifique a instalação do repositório. Erro: {e}")
48
  raise
 
49
 
50
 
51
  # --- CARREGAMENTO GLOBAL DOS MODELOS E CONFIGURAÇÕES ---
52
-
53
  APP_HOME = Path(os.environ.get("APP_HOME", "/app"))
54
  CONFIG_FILE_PATH = APP_HOME / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
55
  MODELS_DIR = Path("/data/ltx_models_official")
@@ -62,153 +47,107 @@ with open(CONFIG_FILE_PATH, "r") as file:
62
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
63
  DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float16
64
 
65
- # --- Baixa os modelos necessários (idempotente) ---
66
- print(f"Verificando e baixando arquivos de modelo para '{MODELS_DIR}'...")
67
- # 1. Baixa os arquivos de pesos principais
68
  for key in ["checkpoint_path", "spatial_upscaler_model_path"]:
69
  filename = PIPELINE_CONFIG_YAML.get(key)
70
  if filename and not (MODELS_DIR / filename).exists():
71
- print(f"Baixando {filename}...")
72
  hf_hub_download(repo_id="Lightricks/LTX-Video", filename=filename, local_dir=str(MODELS_DIR), token=os.getenv("HF_TOKEN"))
73
- # 2. Baixa os componentes de apoio (VAE, Text Encoder, etc.)
74
- snapshot_download(repo_id="Lightricks/LTX-Video", local_dir=str(MODELS_DIR),
75
- allow_patterns=["text_encoder/*", "tokenizer/*", "vae/*", "scheduler/*"],
76
- token=os.getenv("HF_TOKEN"))
77
- print("Arquivos de modelo verificados/baixados.")
78
-
79
- # --- Monta as Pipelines (uma única vez, mantendo-as "quentes") ---
80
- print("Montando pipelines LTX-Video na memória...")
81
- # Modifica a config em memória para usar o caminho local dos componentes
82
- PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"] = str(MODELS_DIR)
83
-
84
- pipeline_instance = create_ltx_video_pipeline(
85
- ckpt_path=str(MODELS_DIR / PIPELINE_CONFIG_YAML["checkpoint_path"]),
86
- precision=PIPELINE_CONFIG_YAML["precision"],
87
- text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
88
- sampler=PIPELINE_CONFIG_YAML["sampler"],
89
- device="cpu", # Carrega na CPU primeiro para economizar VRAM durante a inicialização
90
- )
91
- latent_upsampler_instance = create_latent_upsampler(
92
- latent_upsampler_model_path=str(MODELS_DIR / PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]),
93
- device="cpu"
94
  )
95
-
96
- print(f"Movendo pipelines para o dispositivo: {DEVICE}...")
97
- pipeline_instance.to(DEVICE)
98
- latent_upsampler_instance.to(DEVICE)
99
  pipeline_instance.vae.enable_tiling()
100
- print("✅ Pipelines montadas e prontas na GPU.")
101
-
102
-
103
- # --- FUNÇÃO DE GERAÇÃO PRINCIPAL (CALLBACK DO GRADIO) ---
104
-
105
- def round_to_nearest_resolution(height, width):
106
- ratio = pipeline_instance.vae.spatial_compression_ratio
107
- height = height - (height % ratio)
108
- width = width - (width % ratio)
109
- return int(height), int(width)
110
 
111
 
 
 
112
  def generate(
113
- prompt: str,
114
- image_input: Optional[str],
115
- target_height: int,
116
- target_width: int,
117
- num_frames: int,
118
- seed: int,
119
- guidance_scale: float,
120
- num_inference_steps: int,
121
- denoise_strength: float,
122
  progress=gr.Progress(track_tqdm=True)
123
  ):
124
- if not image_input and not prompt:
125
- raise gr.Error("Por favor, forneça uma imagem de entrada ou um prompt de texto.")
126
-
127
  seed_everething(seed)
128
  generator = torch.Generator(device=DEVICE).manual_seed(seed)
129
 
130
- conditions = None
 
 
 
 
131
  if image_input:
132
  progress(0.1, desc="Preparando imagem de condição...")
133
- image = load_image(image_input)
134
- video_condition_input = load_video(export_to_video([image]))
135
- condition = ConditioningItem(video_condition_input.to(DEVICE), 0, 1.0)
136
- conditions = [condition]
 
137
 
138
- # --- LÓGICA MULTI-ESCALA ---
139
  multi_scale_pipeline = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
140
-
141
- # Prepara os argumentos com base no YAML e na UI
142
- first_pass_args = PIPELINE_CONFIG_YAML.get("first_pass", {}).copy()
143
- second_pass_args = PIPELINE_CONFIG_YAML.get("second_pass", {}).copy()
144
-
145
- # Sobrescreve com os valores da UI onde faz sentido
146
- # Se o YAML tiver uma lista para guidance_scale, respeitamos isso. Se não, usamos o valor da UI.
147
- if not isinstance(first_pass_args.get("guidance_scale"), list):
148
- first_pass_args["guidance_scale"] = guidance_scale
149
- if not isinstance(second_pass_args.get("guidance_scale"), list):
150
- second_pass_args["guidance_scale"] = guidance_scale
151
-
152
- first_pass_args["num_inference_steps"] = num_inference_steps
153
- second_pass_args["denoise_strength"] = denoise_strength
154
 
155
  call_kwargs = {
156
- "prompt": prompt,
157
- "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
158
  "height": target_height, "width": target_width, "num_frames": num_frames,
159
  "generator": generator, "output_type": "pt",
160
- "conditioning_items": conditions,
161
- "decode_timestep": PIPELINE_CONFIG_YAML["decode_timestep"],
162
- "decode_noise_scale": PIPELINE_CONFIG_YAML["decode_noise_scale"],
163
- "downscale_factor": PIPELINE_CONFIG_YAML["downscale_factor"],
164
- "first_pass": first_pass_args,
165
- "second_pass": second_pass_args,
166
  }
167
-
168
- print("[LTX App] Executando pipeline multi-escala...")
169
- progress(0.3, desc="Gerando vídeo (pode levar alguns minutos)...")
170
  result_tensor = multi_scale_pipeline(**call_kwargs).images
171
 
172
- # --- ETAPA FINAL: Exportar para vídeo ---
173
- progress(0.9, desc="Exportando para arquivo de vídeo...")
 
 
 
 
174
  output_video_path = tempfile.mktemp(suffix=".mp4")
175
  video_np = result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
176
  video_np = np.clip(video_np * 255, 0, 255).astype("uint8")
177
  export_to_video(video_np, str(output_video_path), fps=24)
178
 
179
- print(f"Vídeo gerado com sucesso em: {output_video_path}")
180
  return output_video_path
181
 
182
  # --- UI GRADIO ---
183
- with gr.Blocks(title="LTX-Video (Correto)", theme=gr.themes.Soft()) as demo:
184
- gr.HTML("<h1>LTX-Video - Geração de Vídeo Multi-Scale (FP8)</h1><p>Implementação final usando a API nativa do LTX-Video.</p>")
185
-
186
  with gr.Row():
187
  with gr.Column(scale=1):
188
- image_in = gr.Image(type="filepath", label="Imagem de Entrada (Opcional para txt2vid)")
189
- prompt_in = gr.Textbox(label="Prompt", lines=4, placeholder="Ex: a cinematic shot of a majestic lion walking in the savanna, 4k, high quality")
190
-
191
- with gr.Accordion("Parâmetros Principais", open=True):
192
- with gr.Row():
193
- height_in = gr.Slider(label="Altura Final (Height)", minimum=256, maximum=1024, step=32, value=480)
194
- width_in = gr.Slider(label="Largura Final (Width)", minimum=256, maximum=1280, step=32, value=832)
195
- with gr.Row():
196
- frames_in = gr.Slider(label="Número de Frames", minimum=17, maximum=161, step=8, value=97, info="Deve ser um múltiplo de 8 + 1.")
197
- seed_in = gr.Number(label="Seed", value=42, precision=0)
198
-
199
- with gr.Accordion("Parâmetros Avançados", open=False):
200
- num_inference_steps_in = gr.Slider(label="Passos de Inferência (Etapa 1)", minimum=4, maximum=50, step=1, value=30)
201
- guidance_scale_in = gr.Slider(label="Força do Guia (Guidance)", minimum=1.0, maximum=10.0, step=0.5, value=1.0, info="Para modelos 'distilled', o valor recomendado é 1.0.")
202
- denoise_strength_in = gr.Slider(label="Força do Refinamento (Denoise)", minimum=0.1, maximum=1.0, step=0.05, value=0.5, info="Controla a intensidade da Etapa 3 (refinamento).")
203
-
204
  run_button = gr.Button("Gerar Vídeo", variant="primary")
205
-
206
  with gr.Column(scale=1):
207
  video_out = gr.Video(label="Vídeo Gerado")
208
 
209
  run_button.click(
210
  fn=generate,
211
- inputs=[prompt_in, image_in, height_in, width_in, frames_in, seed_in, guidance_scale_in, num_inference_steps_in, denoise_strength_in],
212
  outputs=[video_out],
213
  )
214
 
 
1
  import gradio as gr
2
  import torch
3
+ import spaces
4
  import numpy as np
5
  import random
6
  import os
 
14
  import sys
15
 
16
  # --- SETUP INICIAL: GARANTIR QUE A BIBLIOTECA LTX-VIDEO ESTEJA ACESSÍVEL ---
17
+ # O Dockerfile deve ter clonado e instalado o repositório em /opt/LTX-Video
18
+ LTX_REPO_PATH = Path("/opt/LTX-Video")
19
+ if LTX_REPO_PATH.exists() and str(LTX_REPO_PATH) not in sys.path:
 
 
 
 
 
 
 
 
 
 
 
20
  sys.path.insert(0, str(LTX_REPO_PATH))
21
  print(f"Adicionado '{LTX_REPO_PATH}' ao sys.path.")
22
 
23
+ # ====================================================================
24
+ # <<< IMPORTAÇÕES CORRIGIDAS, EXATAMENTE COMO VOCÊ PEDIU >>>
25
  try:
26
+ from ltx_video.pipelines.pipeline_ltx_video import LTXVideoPipeline, ConditioningItem, LTXMultiScalePipeline
27
+ from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
 
 
 
 
 
28
  from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
29
+ from diffusers.utils import export_to_video, load_image
30
+ from ltx_video.inference import seed_everething, calculate_padding, load_media_file
31
  except ImportError as e:
32
+ print(f"ERRO FATAL: Falha ao importar módulos do LTX-Video. Verifique a instalação. Erro: {e}")
33
  raise
34
+ # ====================================================================
35
 
36
 
37
  # --- CARREGAMENTO GLOBAL DOS MODELOS E CONFIGURAÇÕES ---
 
38
  APP_HOME = Path(os.environ.get("APP_HOME", "/app"))
39
  CONFIG_FILE_PATH = APP_HOME / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
40
  MODELS_DIR = Path("/data/ltx_models_official")
 
47
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
48
  DTYPE = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else torch.float16
49
 
50
+ print(f"Verificando e baixando modelos para '{MODELS_DIR}'...")
51
+ # Baixa os arquivos de pesos principais
 
52
  for key in ["checkpoint_path", "spatial_upscaler_model_path"]:
53
  filename = PIPELINE_CONFIG_YAML.get(key)
54
  if filename and not (MODELS_DIR / filename).exists():
 
55
  hf_hub_download(repo_id="Lightricks/LTX-Video", filename=filename, local_dir=str(MODELS_DIR), token=os.getenv("HF_TOKEN"))
56
+ # Baixa os componentes de apoio
57
+ snapshot_download(repo_id=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"], local_dir=str(MODELS_DIR / "text_encoder"), token=os.getenv("HF_TOKEN"))
58
+ print("Modelos verificados/baixados.")
59
+
60
+
61
+ print("Montando pipelines LTX-Video...")
62
+ # Carrega os componentes individualmente para montar a pipeline
63
+ from transformers import T5EncoderModel, T5Tokenizer
64
+ from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
65
+ from ltx_video.models.transformers.transformer3d import Transformer3DModel
66
+ from ltx_video.schedulers.rf import RectifiedFlowScheduler
67
+ from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
68
+
69
+ transformer = Transformer3DModel.from_pretrained(str(MODELS_DIR / PIPELINE_CONFIG_YAML["checkpoint_path"])).to(DEVICE, dtype=DTYPE)
70
+ vae = CausalVideoAutoencoder.from_pretrained(str(MODELS_DIR / "vae")).to(DEVICE, dtype=DTYPE)
71
+ text_encoder = T5EncoderModel.from_pretrained(str(MODELS_DIR / "text_encoder")).to(DEVICE, dtype=DTYPE)
72
+ tokenizer = T5Tokenizer.from_pretrained(str(MODELS_DIR / "text_encoder"))
73
+ scheduler = RectifiedFlowScheduler.from_pretrained(str(MODELS_DIR / "scheduler"))
74
+
75
+ pipeline_instance = LTXVideoPipeline(
76
+ vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, transformer=transformer, scheduler=scheduler, patchifier=SymmetricPatchifier(patch_size=1)
77
  )
78
+ latent_upsampler_instance = LatentUpsampler.from_pretrained(str(MODELS_DIR / PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"])).to(DEVICE, dtype=DTYPE)
 
 
 
79
  pipeline_instance.vae.enable_tiling()
80
+ print("✅ Pipelines prontas na GPU.")
 
 
 
 
 
 
 
 
 
81
 
82
 
83
+ # --- FUNÇÃO DE GERAÇÃO PRINCIPAL ---
84
+ @spaces.GPU
85
  def generate(
86
+ prompt: str, image_input: Optional[str],
87
+ target_height: int, target_width: int, num_frames: int, seed: int,
 
 
 
 
 
 
 
88
  progress=gr.Progress(track_tqdm=True)
89
  ):
 
 
 
90
  seed_everething(seed)
91
  generator = torch.Generator(device=DEVICE).manual_seed(seed)
92
 
93
+ height_padded = ((target_height - 1) // 32 + 1) * 32
94
+ width_padded = ((target_width - 1) // 32 + 1) * 32
95
+ padding_values = calculate_padding(target_height, target_width, height_padded, width_padded)
96
+
97
+ conditioning_items = None
98
  if image_input:
99
  progress(0.1, desc="Preparando imagem de condição...")
100
+ media_tensor = load_media_file(
101
+ media_path=image_input, height=target_height, width=target_width,
102
+ max_frames=1, padding=padding_values, just_crop=True
103
+ )
104
+ conditioning_items = [ConditioningItem(media_tensor.to(DEVICE, dtype=DTYPE), 0, 1.0)]
105
 
 
106
  multi_scale_pipeline = LTXMultiScalePipeline(pipeline_instance, latent_upsampler_instance)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  call_kwargs = {
109
+ "prompt": prompt, "negative_prompt": "worst quality...",
 
110
  "height": target_height, "width": target_width, "num_frames": num_frames,
111
  "generator": generator, "output_type": "pt",
112
+ "conditioning_items": conditioning_items,
113
+ **PIPELINE_CONFIG_YAML
 
 
 
 
114
  }
115
+
116
+ progress(0.3, desc="Gerando vídeo...")
 
117
  result_tensor = multi_scale_pipeline(**call_kwargs).images
118
 
119
+ pad_left, pad_right, pad_top, pad_bottom = padding_values
120
+ slice_h_end = -pad_bottom if pad_bottom > 0 else None
121
+ slice_w_end = -pad_right if pad_right > 0 else None
122
+ result_tensor = result_tensor[:, :, :num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
123
+
124
+ progress(0.9, desc="Exportando vídeo...")
125
  output_video_path = tempfile.mktemp(suffix=".mp4")
126
  video_np = result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
127
  video_np = np.clip(video_np * 255, 0, 255).astype("uint8")
128
  export_to_video(video_np, str(output_video_path), fps=24)
129
 
 
130
  return output_video_path
131
 
132
  # --- UI GRADIO ---
133
+ with gr.Blocks(title="LTX-Video (Final)", theme=gr.themes.Soft()) as demo:
134
+ gr.HTML("<h1>LTX-Video - Geração de Vídeo Multi-Scale (FP8)</h1>")
 
135
  with gr.Row():
136
  with gr.Column(scale=1):
137
+ image_in = gr.Image(type="filepath", label="Imagem de Entrada (Opcional)")
138
+ prompt_in = gr.Textbox(label="Prompt", lines=4, placeholder="Ex: a cinematic shot...")
139
+ with gr.Accordion("Parâmetros", open=True):
140
+ height_in = gr.Slider(label="Altura", minimum=256, maximum=1024, step=32, value=480)
141
+ width_in = gr.Slider(label="Largura", minimum=256, maximum=1280, step=32, value=832)
142
+ frames_in = gr.Slider(label="Frames", minimum=17, maximum=161, step=8, value=97)
143
+ seed_in = gr.Number(label="Seed", value=42, precision=0)
 
 
 
 
 
 
 
 
 
144
  run_button = gr.Button("Gerar Vídeo", variant="primary")
 
145
  with gr.Column(scale=1):
146
  video_out = gr.Video(label="Vídeo Gerado")
147
 
148
  run_button.click(
149
  fn=generate,
150
+ inputs=[prompt_in, image_in, height_in, width_in, frames_in, seed_in],
151
  outputs=[video_out],
152
  )
153