import os, random, tempfile

import gradio as gr
import torch
import numpy as np
from PIL import Image
from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video

# LTX-Video model via Diffusers
MODEL_REPO = os.getenv("LTX_REPO", "Lightricks/LTX-Video")

# Basic parameters
FPS = 24
MAX_FRAMES = 161
MIN_DIM = 256
MAX_DIM = 1280

device = "cuda" if torch.cuda.is_available() else "cpu"


# Type helpers
def _to_int(x, d):
    if isinstance(x, (list, tuple)):
        x = x[0] if x else d
    try:
        return int(x)
    except Exception:
        return d


def _to_float(x, d):
    if isinstance(x, (list, tuple)):
        x = x[0] if x else d
    try:
        return float(x)
    except Exception:
        return d


def _to_bool(x, d=True):
    if isinstance(x, (list, tuple)):
        x = x[0] if x else d
    return bool(x)


# Frame count aligned to (8k + 1), as required by LTX-Video
def _frames_from_secs(secs):
    secs = _to_float(secs, 2.0)
    n = max(9, int(round(secs * FPS)))
    k = round((n - 1) / 8.0)
    return int(max(9, min(MAX_FRAMES, k * 8 + 1)))


def _pad32(v):
    return ((v - 1) // 32 + 1) * 32


def _dims_for_image(path, target=768):
    im = Image.open(path)
    w, h = im.size
    if w >= h:
        new_h = target
        new_w = int(round((w / max(1, h)) * new_h / 32) * 32)
    else:
        new_w = target
        new_h = int(round((h / max(1, w)) * new_w / 32) * 32)
    new_h = max(MIN_DIM, min(new_h, MAX_DIM))
    new_w = max(MIN_DIM, min(new_w, MAX_DIM))
    return new_h, new_w


print(f"Loading {MODEL_REPO} (LTXConditionPipeline)...")
pipe = LTXConditionPipeline.from_pretrained(
    MODEL_REPO,
    torch_dtype=torch.bfloat16,  # simple and stable; optional FP8 can be added later
)
pipe.to(device)

# Disable dynamic shifting in the scheduler so it does not require 'mu'
if hasattr(pipe, "scheduler") and hasattr(pipe.scheduler, "use_dynamic_shifting"):
    pipe.scheduler.use_dynamic_shifting = False

# VAE tiling to reduce VRAM peaks
if hasattr(pipe, "vae") and hasattr(pipe.vae, "enable_tiling"):
    pipe.vae.enable_tiling()


def handle_dims(image_path, cur_h, cur_w):
    if not image_path:
        return gr.update(value=cur_h), gr.update(value=cur_w)
    try:
        h, w = _dims_for_image(image_path, 768)
        return gr.update(value=h), gr.update(value=w)
    except Exception as e:
        print(f"Error adjusting dimensions: {e}")
        return gr.update(value=cur_h), gr.update(value=cur_w)


def generate_i2v(
    prompt, neg_prompt, image_path,
    height_ui, width_ui, duration_ui,
    seed_ui, randomize_seed,
    guidance_ui, denoise_ui, image_noise_ui,
    progress=gr.Progress(track_tqdm=True)
):
    if not image_path:
        raise gr.Error("Select an image.")

    h = _to_int(height_ui, 512)
    w = _to_int(width_ui, 704)
    h_pad = _pad32(h)
    w_pad = _pad32(w)
    num_frames = _frames_from_secs(duration_ui)

    # Stable quality parameters
    guidance_scale = _to_float(guidance_ui, 5.0)             # 4.0–6.0 work well
    denoise_strength = _to_float(denoise_ui, 0.4)            # 0.3–0.5 preserves the image well
    image_cond_noise_scale = _to_float(image_noise_ui, 0.0)  # 0.0 locks the appearance; 0.01–0.03 relaxes it

    seed = _to_int(seed_ui, 42)
    if _to_bool(randomize_seed, True):
        seed = random.randint(0, 2**32 - 1)

    # Condition: the input image as the first frame
    img = Image.open(image_path).convert("RGB")
    cond = LTXVideoCondition(image=img, frame_index=0, strength=1.0)

    gen = torch.Generator(device=device).manual_seed(seed)

    progress(0.0, desc="Generating video...")
    out = pipe(
        conditions=[cond],
        prompt=prompt,
        negative_prompt=neg_prompt,
        width=w_pad,
        height=h_pad,
        num_frames=num_frames,
        num_inference_steps=30,  # simple and stable
        #guidance_scale=guidance_scale,
        #guidance_rescale=0.7,  # helps stabilize CFG
        #decode_timestep=0.05,  # safe values for >=0.9.1
        #decode_noise_scale=0.025,
        #image_cond_noise_scale=image_cond_noise_scale,
        #denoise_strength=denoise_strength,
        generator=gen,
        output_type="pil",
    )
    frames = out.frames[0]

    tmp = tempfile.mkdtemp()
    out_path = os.path.join(tmp, f"output_{random.randint(10000, 99999)}.mp4")

    progress(0.8, desc="Saving video")
    export_to_video(frames, out_path, fps=FPS)
    return out_path, int(seed)


# Simple UI
with gr.Blocks() as demo:
    gr.Markdown("Simple LTX I2V (Diffusers) with denoise and dynamic shifting disabled")
    with gr.Row():
        with gr.Column():
            img = gr.Image(label="Image", type="filepath")
            prompt = gr.Textbox(label="Prompt", value="Subject moves gently; subtle camera push-in", lines=2)
            neg = gr.Textbox(label="Negative", value="worst quality, jitter, blur, distortions", lines=2)
            dur = gr.Slider(label="Duration (s)", minimum=0.5, maximum=8.0, step=0.1, value=2.0)
            with gr.Row():
                h = gr.Slider(label="Height", minimum=MIN_DIM, maximum=MAX_DIM, step=32, value=512)
                w = gr.Slider(label="Width", minimum=MIN_DIM, maximum=MAX_DIM, step=32, value=704)
            with gr.Accordion("Advanced", open=False):
                seed = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32 - 1)
                rand = gr.Checkbox(label="Randomize seed", value=True)
                guidance = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=5.0)
                denoise = gr.Slider(label="Denoise strength", minimum=0.0, maximum=1.0, step=0.05, value=0.4)
                image_noise = gr.Slider(label="Image cond noise", minimum=0.0, maximum=0.2, step=0.005, value=0.0)
            btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            vid = gr.Video(label="Video")

    img.upload(handle_dims, [img, h, w], [h, w])
    btn.click(
        generate_i2v,
        [prompt, neg, img, h, w, dur, seed, rand, guidance, denoise, image_noise],
        [vid, seed]
    )

if __name__ == "__main__":
    # For MCP integration: install gradio[mcp] and add mcp_server=True if needed.
    demo.queue().launch(debug=True, share=False)
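    # Minimal sketch of the MCP-enabled launch mentioned above; assumes a Gradio
    # version with MCP support and `pip install "gradio[mcp]"`:
    # demo.queue().launch(debug=True, share=False, mcp_server=True)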