# video_service.py

import torch
import numpy as np
import random
import os
import yaml
from pathlib import Path
import imageio
import tempfile
import sys
import subprocess
import threading
import time
from huggingface_hub import hf_hub_download

# --- SETUP AND DEPENDENCY LOGIC ---

def run_setup():
    setup_script_path = "setup.py"
    if not os.path.exists(setup_script_path):
        print("AVISO: script 'setup.py' não encontrado. Pulando a clonagem de dependências.")
        return
    try:
        print("--- Executando setup.py para garantir que as dependências estão presentes ---")
        subprocess.run([sys.executable, setup_script_path], check=True)
        print("--- Setup concluído com sucesso ---")
    except subprocess.CalledProcessError as e:
        print(f"ERRO CRÍTICO DURANTE O SETUP: 'setup.py' falhou com código {e.returncode}.")
        sys.exit(1)

DEPS_DIR = Path("./deps")
LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
if not LTX_VIDEO_REPO_DIR.exists():
    run_setup()

def add_deps_to_path():
    if not LTX_VIDEO_REPO_DIR.exists():
        raise FileNotFoundError(f"LTX-Video repository not found at '{LTX_VIDEO_REPO_DIR}'. Run the setup.")
    if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
        sys.path.insert(0, str(LTX_VIDEO_REPO_DIR.resolve()))

add_deps_to_path()

# Model-specific imports
from inference import (
    create_ltx_video_pipeline, create_latent_upsampler,
    load_image_to_tensor_with_resize_and_crop, seed_everething,  # sic: misspelled this way in the upstream LTX-Video repo
    calculate_padding, load_media_file
)
from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy

# --- GPU DISTRIBUTION CONFIGURATION ---
GPU_MAPPING = [
    {'base': 'cuda:0', 'upscaler': 'cuda:2'},
    {'base': 'cuda:1', 'upscaler': 'cuda:3'}
]
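# Note: this mapping assumes a 4-GPU host where worker i runs the denoising
# pipeline on cuda:i and the latent upsampler on cuda:(i+2); adjust the pairs
# to match the actual topology of your machine.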

class VideoService:
    def __init__(self):
        print("Inicializando VideoService (modo Lazy Loading)...")
        self.models_loaded = False
        self.workers = None
        self.config = self._load_config()
        self.models_dir = "downloaded_models"
        self.loading_lock = threading.Lock()  # Prevents concurrent requests from triggering model loading at the same time

    def _ensure_models_are_loaded(self):
        """Verifica se os modelos estão carregados e os carrega se não estiverem."""
        with self.loading_lock:
            if not self.models_loaded:
                print("Primeira requisição recebida. Iniciando carregamento dos modelos...")
                if torch.cuda.is_available() and torch.cuda.device_count() < 4:
                    raise RuntimeError(f"Este serviço está configurado para 4 GPUs, mas apenas {torch.cuda.device_count()} foram encontradas.")
                
                self._download_model_files()
                self.workers = self._initialize_workers()
                self.models_loaded = True
                print(f"Modelos carregados com sucesso. {len(self.workers)} workers prontos.")

    def _load_config(self):
        config_file_path = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled.yaml"
        with open(config_file_path, "r") as file:
            return yaml.safe_load(file)

    def _download_model_files(self):
        Path(self.models_dir).mkdir(parents=True, exist_ok=True)
        LTX_REPO = "Lightricks/LTX-Video"
        print("Baixando arquivos de modelo (se necessário)...")
        self.distilled_model_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["checkpoint_path"], local_dir=self.models_dir)
        self.spatial_upscaler_path = hf_hub_download(repo_id=LTX_REPO, filename=self.config["spatial_upscaler_model_path"], local_dir=self.models_dir)
        print("Download de modelos concluído.")

    def _load_models_for_worker(self, base_device, upscaler_device):
        print(f"Carregando modelo base para {base_device} e upscaler para {upscaler_device}")
        pipeline = create_ltx_video_pipeline(
            ckpt_path=self.distilled_model_path, precision=self.config["precision"],
            text_encoder_model_name_or_path=self.config["text_encoder_model_name_or_path"],
            sampler=self.config["sampler"], device="cpu", enhance_prompt=False,
            prompt_enhancer_image_caption_model_name_or_path=self.config["prompt_enhancer_image_caption_model_name_or_path"],
            prompt_enhancer_llm_model_name_or_path=self.config["prompt_enhancer_llm_model_name_or_path"],
        )
        latent_upsampler = create_latent_upsampler(self.spatial_upscaler_path, device="cpu")
        pipeline.to(base_device)
        latent_upsampler.to(upscaler_device)
        return pipeline, latent_upsampler

    def _initialize_workers(self):
        workers = []
        for i, mapping in enumerate(GPU_MAPPING):
            print(f"--- Inicializando Worker {i} ---")
            pipeline, latent_upsampler = self._load_models_for_worker(mapping['base'], mapping['upscaler'])
            workers.append({"id": i, "base_pipeline": pipeline, "latent_upsampler": latent_upsampler, "devices": mapping, "lock": threading.Lock()})
        return workers

    def _acquire_worker(self):
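        # Simple busy-wait scheduler: poll each worker's lock without blocking,
        # and back off for 100 ms when all workers are busy.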
        while True:
            for worker in self.workers:
                if worker["lock"].acquire(blocking=False):
                    print(f"Worker {worker['id']} adquirido para uma nova tarefa.")
                    return worker
            time.sleep(0.1)

    def generate(self, prompt, negative_prompt, input_image_filepath=None, input_video_filepath=None,
                 height=512, width=704, mode="text-to-video", duration=2.0,
                 frames_to_use=9, seed=42, randomize_seed=True, guidance_scale=1.0,  # guidance_scale is ignored, but kept for compatibility
                 improve_texture=True, progress_callback=None):
        
        # The lazy-loading magic happens here: the first request loads the models
        self._ensure_models_are_loaded()
        
        worker = self._acquire_worker()
        base_device = worker['devices']['base']
        upscaler_device = worker['devices']['upscaler']
        
        try:
            if mode == "image-to-video" and not input_image_filepath:
                raise ValueError("An image path is required for image-to-video mode")
            if mode == "video-to-video" and not input_video_filepath:
                raise ValueError("A video path is required for video-to-video mode")

            used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
            seed_everething(used_seed)

            FPS = 24.0
            MAX_NUM_FRAMES = 257
            target_frames_rounded = round(duration * FPS)
            n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
            actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
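            # Worked example: duration=2.0 at 24 fps targets 48 frames, so
            # n_val = round(47 / 8) = 6 and actual_num_frames = 6*8 + 1 = 49.
            # The pipeline works with frame counts of the form 8n + 1, clamped
            # to [9, 257], as enforced above.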
            
            height_padded = ((height - 1) // 32 + 1) * 32
            width_padded = ((width - 1) // 32 + 1) * 32
            padding_values = calculate_padding(height, width, height_padded, width_padded)
            pad_left, pad_right, pad_top, pad_bottom = padding_values
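            # Worked example: height=500 rounds up to 512 while width=704 is
            # already a multiple of 32, so only the vertical padding is non-zero.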
            
            call_kwargs_base = {
                "prompt": prompt, "negative_prompt": negative_prompt, "num_frames": actual_num_frames, "frame_rate": int(FPS),
                "decode_timestep": 0.05, "decode_noise_scale": self.config["decode_noise_scale"],
                "stochastic_sampling": self.config["stochastic_sampling"], "image_cond_noise_scale": 0.025,
                "is_video": True, "vae_per_channel_normalize": True, "mixed_precision": (self.config["precision"] == "mixed_precision"),
                "offload_to_cpu": False, "enhance_prompt": False, "skip_layer_strategy": SkipLayerStrategy.AttentionValues
            }

            result_tensor = None
            if improve_texture:
                downscale_factor = self.config.get("downscale_factor", 0.5)
                downscaled_height_ideal = int(height_padded * downscale_factor)
                downscaled_width_ideal = int(width_padded * downscale_factor)
                downscaled_height = ((downscaled_height_ideal - 1) // 32 + 1) * 32
                downscaled_width = ((downscaled_width_ideal - 1) // 32 + 1) * 32

                # --- PASS 1 ---
                first_pass_kwargs = call_kwargs_base.copy()
                first_pass_kwargs.update({
                    "height": downscaled_height, "width": downscaled_width,
                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
                    "output_type": "latent", "guidance_scale": 1.0,
                    "timesteps": self.config["first_pass"]["timesteps"],
                    "stg_scale": self.config["first_pass"]["stg_scale"],
                    "rescaling_scale": self.config["first_pass"]["rescaling_scale"],
                    "skip_block_list": self.config["first_pass"]["skip_block_list"]
                })

                if mode == "image-to-video":
                    padding_low_res = calculate_padding(downscaled_height, downscaled_width, downscaled_height, downscaled_width)
                    media_tensor_low_res = load_image_to_tensor_with_resize_and_crop(input_image_filepath, downscaled_height, downscaled_width)
                    media_tensor_low_res = torch.nn.functional.pad(media_tensor_low_res, padding_low_res)
                    first_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_low_res.to(base_device), 0, 1.0)]

                print(f"Worker {worker['id']}: Iniciando passe 1 em {base_device}")
                with torch.no_grad(): low_res_latents = worker['base_pipeline'](**first_pass_kwargs).images
                
                low_res_latents = low_res_latents.to(upscaler_device)
                with torch.no_grad(): high_res_latents = worker['latent_upsampler'](low_res_latents)
                high_res_latents = high_res_latents.to(base_device)
                
                # --- PASS 2 ---
                second_pass_kwargs = call_kwargs_base.copy()
                high_res_h, high_res_w = downscaled_height * 2, downscaled_width * 2
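                # The second pass renders at twice the downscaled size, matching
                # the 2x spatial factor applied by the latent upsampler.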
                second_pass_kwargs.update({
                    "height": high_res_h, "width": high_res_w, "latents": high_res_latents,
                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
                    "output_type": "pt", "image_cond_noise_scale": 0.0, "guidance_scale": 1.0,
                    "timesteps": self.config["second_pass"]["timesteps"],
                    "stg_scale": self.config["second_pass"]["stg_scale"],
                    "rescaling_scale": self.config["second_pass"]["rescaling_scale"],
                    "skip_block_list": self.config["second_pass"]["skip_block_list"],
                    "tone_map_compression_ratio": self.config["second_pass"].get("tone_map_compression_ratio", 0.0)
                })
                
                if mode == "image-to-video":
                    padding_high_res = calculate_padding(high_res_h, high_res_w, high_res_h, high_res_w)
                    media_tensor_high_res = load_image_to_tensor_with_resize_and_crop(input_image_filepath, high_res_h, high_res_w)
                    media_tensor_high_res = torch.nn.functional.pad(media_tensor_high_res, padding_high_res)
                    second_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_high_res.to(base_device), 0, 1.0)]

                print(f"Worker {worker['id']}: Iniciando passe 2 em {base_device}")
                with torch.no_grad(): result_tensor = worker['base_pipeline'](**second_pass_kwargs).images

            else:  # single pass
                single_pass_kwargs = call_kwargs_base.copy()
                first_pass_config = self.config["first_pass"]
                single_pass_kwargs.update({
                    "height": height_padded, "width": width_padded, "output_type": "pt",
                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
                    "guidance_scale": 1.0, **first_pass_config
                })
                if mode == "image-to-video":
                    media_tensor_final = load_image_to_tensor_with_resize_and_crop(input_image_filepath, height_padded, width_padded)
                    media_tensor_final = torch.nn.functional.pad(media_tensor_final, padding_values)
                    single_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_final.to(base_device), 0, 1.0)]
                elif mode == "video-to-video":
                    single_pass_kwargs["media_items"] = load_media_file(media_path=input_video_filepath, height=height_padded, width=width_padded, max_frames=int(frames_to_use), padding=padding_values).to(base_device)
                
                print(f"Worker {worker['id']}: Iniciando passe único em {base_device}")
                with torch.no_grad(): result_tensor = worker['base_pipeline'](**single_pass_kwargs).images
            
            if result_tensor.shape[-2:] != (height, width):
                num_frames_final = result_tensor.shape[2]
                videos_tensor = result_tensor.permute(0, 2, 1, 3, 4).reshape(-1, result_tensor.shape[1], result_tensor.shape[3], result_tensor.shape[4])
                videos_resized = torch.nn.functional.interpolate(videos_tensor, size=(height, width), mode='bilinear', align_corners=False)
                result_tensor = videos_resized.reshape(result_tensor.shape[0], num_frames_final, result_tensor.shape[1], height, width).permute(0, 2, 1, 3, 4)

            # Trim to the final frame count and strip the spatial padding added
            # earlier (a slice bound of None means that side was not padded).
            result_tensor = result_tensor[
                :, :, :actual_num_frames,
                (pad_top if pad_top > 0 else None):(-pad_bottom if pad_bottom > 0 else None),
                (pad_left if pad_left > 0 else None):(-pad_right if pad_right > 0 else None),
            ]
            video_np = (result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy() * 255).astype(np.uint8)
            temp_dir = tempfile.mkdtemp()
            output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")

            with imageio.get_writer(output_video_path, fps=call_kwargs_base["frame_rate"], codec='libx264', quality=8) as writer:
                for i, frame in enumerate(video_np):
                    writer.append_data(frame)
                    if progress_callback: progress_callback(i + 1, len(video_np))
            return output_video_path, used_seed

        except Exception as e:
            print(f"!!!!!!!! ERRO no Worker {worker['id']} !!!!!!!!\n{e}")
            raise e
        finally:
            print(f"Worker {worker['id']}: Tarefa finalizada. Limpando cache e liberando worker...")
            with torch.cuda.device(base_device): torch.cuda.empty_cache()
            with torch.cuda.device(upscaler_device): torch.cuda.empty_cache()
            worker["lock"].release()

# The service instance is created here, but the models are only loaded on the first request.
video_generation_service = VideoService()
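
# --- Usage sketch (illustrative only) ---
# A minimal example of how a caller might drive the service; the prompt and
# parameter values below are placeholders, not part of the service itself.
# Models are loaded lazily on the first generate() call.
if __name__ == "__main__":
    def print_progress(done, total):
        # Simple console progress reporter for the encoding stage.
        print(f"Encoding frame {done}/{total}", end="\r")

    video_path, used_seed = video_generation_service.generate(
        prompt="A sailboat gliding across a calm lake at sunset",
        negative_prompt="blurry, distorted, low quality",
        mode="text-to-video",
        height=512,
        width=704,
        duration=2.0,
        randomize_seed=True,
        improve_texture=True,
        progress_callback=print_progress,
    )
    print(f"\nVideo written to {video_path} (seed {used_seed})")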