EuuIia committed on
Commit b47d506 · verified · 1 Parent(s): 18b2069

Update video_service.py

Files changed (1)
  video_service.py  +110 -133
video_service.py CHANGED
@@ -1,4 +1,3 @@
-
 # video_service.py
 
 import torch
@@ -30,7 +29,7 @@ def run_setup():
         print(f"ERRO CRÍTICO DURANTE O SETUP: 'setup.py' falhou com código {e.returncode}.")
         sys.exit(1)
 
-DEPS_DIR = Path("/data")
+DEPS_DIR = Path("./deps")
 LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
 if not LTX_VIDEO_REPO_DIR.exists():
     run_setup()
@@ -41,7 +40,7 @@ def add_deps_to_path():
     if str(LTX_VIDEO_REPO_DIR.resolve()) not in sys.path:
         sys.path.insert(0, str(LTX_VIDEO_REPO_DIR.resolve()))
 
-#add_deps_to_path()
+add_deps_to_path()
 
 # Importações específicas do modelo
 from inference import (
@@ -123,163 +122,141 @@ class VideoService:
                     return worker
            time.sleep(0.1)
 
-
-    def generate(self, prompt, negative_prompt, input_image_filepath=None, input_video_filepath=None, height=512, width=704, mode="text-to-video", duration=2.0, frames_to_use=9, seed=42, randomize_seed=True, guidance_scale=1.0, # Agora usado corretamente
+    def generate(self, prompt, negative_prompt, input_image_filepath=None, input_video_filepath=None,
+                 height=512, width=704, mode="text-to-video", duration=2.0,
+                 frames_to_use=9, seed=42, randomize_seed=True, guidance_scale=1.0, # Ignorado, mas mantido por compatibilidade
                  improve_texture=True, progress_callback=None):
 
+        # A MÁGICA DO LAZY LOADING ACONTECE AQUI
         self._ensure_models_are_loaded()
+
         worker = self._acquire_worker()
         base_device = worker['devices']['base']
         upscaler_device = worker['devices']['upscaler']
 
         try:
-            # Validações alinhadas com app-20.py
-            if mode == "image-to-video" and not input_image_filepath:
-                raise ValueError("Caminho da imagem obrigatório para o modo image-to-video")
-            if mode == "video-to-video" and not input_video_filepath:
-                raise ValueError("Caminho do vídeo obrigatório para o modo video-to-video")
-
+            # ... (todo o resto do código da função generate permanece exatamente o mesmo) ...
+            if mode == "image-to-video" and not input_image_filepath: raise ValueError("Caminho da imagem é obrigatório para o modo image-to-video")
+            if mode == "video-to-video" and not input_video_filepath: raise ValueError("Caminho do vídeo é obrigatório para o modo video-to-video")
+
            used_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
            seed_everething(used_seed)
-
-            FPS = 30.0 # Alinhado com app-20.py
-            MAX_NUM_FRAMES = 257
-            target_frames_ideal = duration * FPS
-            target_frames_rounded = round(target_frames_ideal)
-            if target_frames_rounded < 1: target_frames_rounded = 1
-            n_val = round(float(target_frames_rounded - 1.0) / 8.0)
+
+            FPS = 24.0; MAX_NUM_FRAMES = 257
+            target_frames_rounded = round(duration * FPS)
+            n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
            actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))
 
-            actual_height = int(height)
-            actual_width = int(width)
-            height_padded = (actual_height - 1) // 32 * 32 + 32
-            width_padded = (actual_width - 1) // 32 * 32 + 32
-            num_frames_padded = (actual_num_frames - 2) // 8 * 8 + 1 # Alinhamento exato com app-20.py
-            if num_frames_padded != actual_num_frames:
-                print(f"Warning: actual_num_frames {actual_num_frames} and num_frames_padded {num_frames_padded} differ. Using num_frames_padded for pipeline.")
-
-            padding_values = calculate_padding(actual_height, actual_width, height_padded, width_padded)
+            height_padded = ((height - 1) // 32 + 1) * 32
+            width_padded = ((width - 1) // 32 + 1) * 32
+            padding_values = calculate_padding(height, width, height_padded, width_padded)
            pad_left, pad_right, pad_top, pad_bottom = padding_values
 
-            # Kwargs base alinhados
-            call_kwargs = {
-                "prompt": prompt,
-                "negative_prompt": negative_prompt,
-                "height": height_padded,
-                "width": width_padded,
-                "num_frames": num_frames_padded,
-                "framerate": int(FPS),
-                "generator": torch.Generator(device=base_device).manual_seed(used_seed),
-                "output_type": "pt",
-                "conditioning_items": None,
-                "media_items": None,
-                "decode_timestep": self.config['decode_timestep'],
-                "decode_noise_scale": self.config['decode_noise_scale'],
-                "stochastic_sampling": self.config['stochastic_sampling'],
-                "image_cond_noise_scale": 0.15, # Alinhado
-                "is_video": True,
-                "vae_per_channel_normalize": True,
-                "mixed_precision": self.config['precision'] + " mixed_precision",
-                "offload_to_cpu": False,
-                "enhance_prompt": False,
+            call_kwargs_base = {
+                "prompt": prompt, "negative_prompt": negative_prompt, "num_frames": actual_num_frames, "frame_rate": int(FPS),
+                "decode_timestep": 0.05, "decode_noise_scale": self.config["decode_noise_scale"],
+                "stochastic_sampling": self.config["stochastic_sampling"], "image_cond_noise_scale": 0.025,
+                "is_video": True, "vae_per_channel_normalize": True, "mixed_precision": (self.config["precision"] == "mixed_precision"),
+                "offload_to_cpu": False, "enhance_prompt": False, "skip_layer_strategy": SkipLayerStrategy.AttentionValues
            }
-
-            # Estratégia de skip layer alinhada
-            stg_mode_str = self.config.get('stg_mode', 'attention_values')
-            if stg_mode_str.lower() in ['stgav', 'attentionvalues']:
-                call_kwargs['skip_layer_strategy'] = SkipLayerStrategy.AttentionValues
-            # ... (adicionar outros elif como no app-20.py)
-
-            # Conditioning para modos
-            if mode == "image-to-video" and input_image_filepath:
-                media_tensor = load_image_to_tensor_with_resize_and_crop(input_image_filepath, actual_height, actual_width)
-                media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
-                call_kwargs['conditioning_items'] = ConditioningItem(media_tensor.to(base_device), 0, 1.0)
-            elif mode == "video-to-video" and input_video_filepath:
-                call_kwargs['media_items'] = load_media_file(media_path=input_video_filepath, height=actual_height, width=actual_width, max_frames=int(frames_to_use), padding=padding_values).to(base_device)
-
-            result_images_tensor = None
+
+            result_tensor = None
            if improve_texture:
-                # Alinhamento exato: Use LTXMultiScalePipeline como no app-20.py
-                active_latent_upsampler = worker['latent_upsampler']
-                if not active_latent_upsampler:
-                    raise ValueError("Spatial upscaler model not loaded or improve_texture not selected, cannot use multi-scale.")
-
-                multi_scale_pipeline_obj = LTXMultiScalePipeline(worker['base_pipeline'], active_latent_upsampler)
-
-                first_pass_args = self.config.get('first_pass', {}).copy()
-                first_pass_args['guidance_scale'] = float(guidance_scale) # Override UI
-                first_pass_args.pop('num_inference_steps', None)
+                downscale_factor = self.config.get("downscale_factor", 0.5)
+                downscaled_height_ideal = int(height_padded * downscale_factor); downscaled_width_ideal = int(width_padded * downscale_factor)
+                downscaled_height = ((downscaled_height_ideal - 1) // 32 + 1) * 32; downscaled_width = ((downscaled_width_ideal - 1) // 32 + 1) * 32
+
+                # --- PASSE 1 ---
+                first_pass_kwargs = call_kwargs_base.copy()
+                first_pass_kwargs.update({
+                    "height": downscaled_height, "width": downscaled_width,
+                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
+                    "output_type": "latent", "guidance_scale": 1.0,
+                    "timesteps": self.config["first_pass"]["timesteps"],
+                    "stg_scale": self.config["first_pass"]["stg_scale"],
+                    "rescaling_scale": self.config["first_pass"]["rescaling_scale"],
+                    "skip_block_list": self.config["first_pass"]["skip_block_list"]
+                })
+
+                if mode == "image-to-video":
+                    padding_low_res = calculate_padding(downscaled_height, downscaled_width, downscaled_height, downscaled_width)
+                    media_tensor_low_res = load_image_to_tensor_with_resize_and_crop(input_image_filepath, downscaled_height, downscaled_width)
+                    media_tensor_low_res = torch.nn.functional.pad(media_tensor_low_res, padding_low_res)
+                    first_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_low_res.to(base_device), 0, 1.0)]
+
+                print(f"Worker {worker['id']}: Iniciando passe 1 em {base_device}")
+                with torch.no_grad(): low_res_latents = worker['base_pipeline'](**first_pass_kwargs).images
 
-                second_pass_args = self.config.get('second_pass', {}).copy()
-                second_pass_args['guidance_scale'] = float(guidance_scale) # Override UI
-                second_pass_args.pop('num_inference_steps', None)
+                low_res_latents = low_res_latents.to(upscaler_device)
+                with torch.no_grad(): high_res_latents = worker['latent_upsampler'](low_res_latents)
+                high_res_latents = high_res_latents.to(base_device)
 
-                multi_scale_call_kwargs = call_kwargs.copy()
-                multi_scale_call_kwargs.update({
-                    "downscale_factor": self.config['downscale_factor'],
-                    "first_pass": first_pass_args,
-                    "second_pass": second_pass_args,
+                # --- PASSE 2 ---
+                second_pass_kwargs = call_kwargs_base.copy()
+                high_res_h, high_res_w = downscaled_height * 2, downscaled_width * 2
+                second_pass_kwargs.update({
+                    "height": high_res_h, "width": high_res_w, "latents": high_res_latents,
+                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
+                    "output_type": "pt", "image_cond_noise_scale": 0.0, "guidance_scale": 1.0,
+                    "timesteps": self.config["second_pass"]["timesteps"],
+                    "stg_scale": self.config["second_pass"]["stg_scale"],
+                    "rescaling_scale": self.config["second_pass"]["rescaling_scale"],
+                    "skip_block_list": self.config["second_pass"]["skip_block_list"],
+                    "tone_map_compression_ratio": self.config["second_pass"].get("tone_map_compression_ratio", 0.0)
                })
 
-                print(f"Calling multi-scale pipeline eff. HxW {actual_height}x{actual_width}, Frames {actual_num_frames} - Padded {num_frames_padded} on {base_device}")
-                result_images_tensor = multi_scale_pipeline_obj(**multi_scale_call_kwargs).images
-            else:
-                # Single-pass alinhado
-                single_pass_call_kwargs = call_kwargs.copy()
-                first_pass_config_from_yaml = self.config.get('first_pass', {})
-                single_pass_call_kwargs['timesteps'] = first_pass_config_from_yaml.get('timesteps')
-                single_pass_call_kwargs['guidance_scale'] = float(guidance_scale) # Override UI
-                single_pass_call_kwargs['stg_scale'] = first_pass_config_from_yaml.get('stg_scale')
-                single_pass_call_kwargs['rescaling_scale'] = first_pass_config_from_yaml.get('rescaling_scale')
-                single_pass_call_kwargs['skip_block_list'] = first_pass_config_from_yaml.get('skip_block_list')
-                single_pass_call_kwargs.pop('num_inference_steps', None)
-                single_pass_call_kwargs.pop('first_pass', None)
-                single_pass_call_kwargs.pop('second_pass', None)
-                single_pass_call_kwargs.pop('downscale_factor', None)
+                if mode == "image-to-video":
+                    padding_high_res = calculate_padding(high_res_h, high_res_w, high_res_h, high_res_w)
+                    media_tensor_high_res = load_image_to_tensor_with_resize_and_crop(input_image_filepath, high_res_h, high_res_w)
+                    media_tensor_high_res = torch.nn.functional.pad(media_tensor_high_res, padding_high_res)
+                    second_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_high_res.to(base_device), 0, 1.0)]
+
+                print(f"Worker {worker['id']}: Iniciando passe 2 em {base_device}")
+                with torch.no_grad(): result_tensor = worker['base_pipeline'](**second_pass_kwargs).images
+
+            else: # Passe Único
+                single_pass_kwargs = call_kwargs_base.copy()
+                first_pass_config = self.config["first_pass"]
+                single_pass_kwargs.update({
+                    "height": height_padded, "width": width_padded, "output_type": "pt",
+                    "generator": torch.Generator(device=base_device).manual_seed(used_seed),
+                    "guidance_scale": 1.0, **first_pass_config
+                })
+                if mode == "image-to-video":
+                    media_tensor_final = load_image_to_tensor_with_resize_and_crop(input_image_filepath, height_padded, width_padded)
+                    media_tensor_final = torch.nn.functional.pad(media_tensor_final, padding_values)
+                    single_pass_kwargs["conditioning_items"] = [ConditioningItem(media_tensor_final.to(base_device), 0, 1.0)]
+                elif mode == "video-to-video":
+                    single_pass_kwargs["media_items"] = load_media_file(media_path=input_video_filepath, height=height_padded, width=width_padded, max_frames=int(frames_to_use), padding=padding_values).to(base_device)
 
-                print(f"Calling base pipeline padded HxW {height_padded}x{width_padded}, Frames {actual_num_frames} - Padded {num_frames_padded} on {base_device}")
-                result_images_tensor = worker['base_pipeline'](**single_pass_call_kwargs).images
-
-            if result_images_tensor is None:
-                raise ValueError("Generation failed.")
-
-            # Slicing e salvamento alinhados
-            slice_h_end = -pad_bottom if pad_bottom > 0 else None
-            slice_w_end = -pad_right if pad_right > 0 else None
-            result_images_tensor = result_images_tensor[:, :, :actual_num_frames, pad_top:slice_h_end, pad_left:slice_w_end]
-            video_np = result_images_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy()
-            video_np = np.clip(video_np, 0, 1) * 255.0
-            video_np = video_np.astype(np.uint8)
+                print(f"Worker {worker['id']}: Iniciando passe único em {base_device}")
+                with torch.no_grad(): result_tensor = worker['base_pipeline'](**single_pass_kwargs).images
 
+            if result_tensor.shape[-2:] != (height, width):
+                num_frames_final = result_tensor.shape[2]
+                videos_tensor = result_tensor.permute(0, 2, 1, 3, 4).reshape(-1, result_tensor.shape[1], result_tensor.shape[3], result_tensor.shape[4])
+                videos_resized = torch.nn.functional.interpolate(videos_tensor, size=(height, width), mode='bilinear', align_corners=False)
+                result_tensor = videos_resized.reshape(result_tensor.shape[0], num_frames_final, result_tensor.shape[1], height, width).permute(0, 2, 1, 3, 4)
+
+            result_tensor = result_tensor[:, :, :actual_num_frames, (pad_top if pad_top > 0 else None):(-pad_bottom if pad_bottom > 0 else None), (pad_left if pad_left > 0 else None):(-pad_right if pad_right > 0 else None)]
+            video_np = (result_tensor[0].permute(1, 2, 3, 0).cpu().float().numpy() * 255).astype(np.uint8)
            temp_dir = tempfile.mkdtemp()
            output_video_path = os.path.join(temp_dir, f"output_{used_seed}.mp4")
-            try:
-                with imageio.get_writer(output_video_path, fps=call_kwargs['framerate'], macro_block_size=1) as video_writer:
-                    for frame_idx in range(video_np.shape[0]):
-                        if progress_callback:
-                            progress_callback(frame_idx / video_np.shape[0], desc="Saving video")
-                        video_writer.append_data(video_np[frame_idx])
-            except Exception as e:
-                print(f"Error saving video with macro_block_size=1: {e}")
-                with imageio.get_writer(output_video_path, fps=call_kwargs['framerate'], format='FFMPEG', codec='libx264', quality=8) as video_writer:
-                    for frame_idx in range(video_np.shape[0]):
-                        if progress_callback:
-                            progress_callback(frame_idx / video_np.shape[0], desc="Saving video fallback ffmpeg")
-                        video_writer.append_data(video_np[frame_idx])
-
+
+            with imageio.get_writer(output_video_path, fps=call_kwargs_base["frame_rate"], codec='libx264', quality=8) as writer:
+                for i, frame in enumerate(video_np):
+                    writer.append_data(frame)
+                    if progress_callback: progress_callback(i + 1, len(video_np))
            return output_video_path, used_seed
-
+
        except Exception as e:
-            print(f"!!!!!!!! ERRO no Worker {worker['id']}: {e} !!!!!!!!")
+            print(f"!!!!!!!! ERRO no Worker {worker['id']} !!!!!!!!\n{e}")
            raise e
        finally:
-            print(f"Worker {worker['id']} Tarefa finalizada. Limpando cache e liberando worker...")
-            with torch.cuda.device(base_device):
-                torch.cuda.empty_cache()
-            with torch.cuda.device(upscaler_device):
-                torch.cuda.empty_cache()
-            worker['lock'].release()
+            print(f"Worker {worker['id']}: Tarefa finalizada. Limpando cache e liberando worker...")
+            with torch.cuda.device(base_device): torch.cuda.empty_cache()
+            with torch.cuda.device(upscaler_device): torch.cuda.empty_cache()
+            worker["lock"].release()
 
 # A instância do serviço é criada aqui, mas os modelos só serão carregados no primeiro clique.
-video_generation_service = VideoService()
+video_generation_service = VideoService()
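
Note on the new sizing logic: generate() now snaps the requested duration to a frame count of the form 8*n + 1 at 24 fps (clamped to the 9–257 range) and rounds height/width up to multiples of 32 before padding. A minimal standalone sketch of that arithmetic, with made-up example inputs:

# Standalone sketch of the length/size snapping used by generate() above.
# The example duration and sizes are assumptions for illustration only.
FPS = 24.0
MAX_NUM_FRAMES = 257

def snap_num_frames(duration: float) -> int:
    # Round duration to whole frames, then snap to the nearest 8*n + 1, clamped to [9, 257].
    target_frames_rounded = round(duration * FPS)
    n_val = round((float(target_frames_rounded) - 1.0) / 8.0)
    return max(9, min(MAX_NUM_FRAMES, int(n_val * 8 + 1)))

def pad_to_multiple_of_32(dim: int) -> int:
    # Same rounding-up rule as height_padded / width_padded in the diff.
    return ((dim - 1) // 32 + 1) * 32

print(snap_num_frames(2.0))        # 48 requested frames -> 49 (6 * 8 + 1)
print(snap_num_frames(0.1))        # clamped up to the 9-frame minimum
print(pad_to_multiple_of_32(704))  # 704 (already a multiple of 32)
print(pad_to_multiple_of_32(500))  # 512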
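
Hypothetical caller-side sketch (not part of this commit): one way the module-level video_generation_service defined above might be invoked, using only keyword arguments from the generate() signature in this diff; the prompt and image path are made-up example values.

# Hypothetical usage sketch; the prompt and file path below are examples only.
from video_service import video_generation_service

output_path, used_seed = video_generation_service.generate(
    prompt="a sailboat crossing a calm lake at sunrise",
    negative_prompt="blurry, low quality",
    mode="image-to-video",
    input_image_filepath="example.png",  # generate() requires this for image-to-video
    height=512, width=704,
    duration=2.0,
    seed=42, randomize_seed=False,
    improve_texture=True,
)
print(f"Video saved to {output_path} (seed {used_seed})")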