Update app.py

app.py CHANGED
@@ -29,7 +29,7 @@ pipe.to("cuda")
 pipe_upsample.to("cuda")
 pipe.vae.enable_tiling()
 
-def prepare_image_condition(image, size=(
+def prepare_image_condition(image, size=(512, 512), background=(0, 0, 0)):
     image = ImageOps.contain(image, size)
     canvas = Image.new("RGB", size, background)
     offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
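Note: the old signature is truncated by the diff viewer, and the helper's tail falls outside the hunk. A minimal sketch of the full letterboxing helper, assuming the hidden lines simply paste the resized image onto the canvas and return it:

```python
from PIL import Image, ImageOps

def prepare_image_condition(image, size=(512, 512), background=(0, 0, 0)):
    # Shrink to fit within `size` while preserving aspect ratio.
    image = ImageOps.contain(image, size)
    # Letterbox onto a solid canvas so the conditioning image is exactly `size`.
    canvas = Image.new("RGB", size, background)
    offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
    canvas.paste(image, offset)  # assumed: these last two lines sit outside the hunk
    return canvas
```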
@@ -46,22 +46,22 @@ def generate_video(prompt, image_url):
     raw_image = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")
     image = prepare_image_condition(raw_image)
 
-    # Set target resolutions
-    base_width, base_height = 480, 480
-    down_width, down_height = 320, 320
+    # Set target resolutions - using higher quality settings
+    base_width, base_height = 512, 512  # Increased from 480x480
+    down_width, down_height = 384, 384  # Increased from 320x320 for better quality
 
-    # Step 1: Generate latents at lower resolution
+    # Step 1: Generate latents at lower resolution with better quality settings
     latents = pipe(
         prompt=prompt,
         image=image,
         width=down_width,
         height=down_height,
         num_frames=60,
-        num_inference_steps=7,
+        num_inference_steps=12,  # Increased from 7 for better quality
         output_type="latent",
-        guidance_scale=1.0,
-        decode_timestep=0.
-        decode_noise_scale=0.
+        guidance_scale=2.0,  # Increased from 1.0 for better prompt adherence
+        decode_timestep=0.1,  # Adjusted for better quality
+        decode_noise_scale=0.1,  # Adjusted for better quality
         generator=generator
     ).frames
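Step 2 (upscaling the latents) is unchanged, so it sits invisibly between this hunk and the next. Assuming the usual diffusers LTX latent-upsampler pattern, the hidden code presumably resembles:

```python
# Hypothetical reconstruction of the unchanged Step 2 (not shown in this diff):
# upscale the low-resolution latents with the dedicated upsampler pipeline,
# staying in latent space so Step 3 can refine and decode them.
upscaled_latents = pipe_upsample(
    latents=latents,        # low-res latents from Step 1 (output_type="latent")
    output_type="latent",
).frames
```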
@@ -74,19 +74,19 @@ def generate_video(prompt, image_url):
     torch.cuda.empty_cache()
     gc.collect()
 
-    # Step 3: Decode upscaled latents to frames using the pipeline
-    # Create a new pipeline call specifically for decoding
+    # Step 3: Decode upscaled latents to frames using the pipeline with better settings
     frames = pipe(
-        prompt=
+        prompt=prompt,  # Use original prompt for better consistency
         latents=upscaled_latents,
         width=base_width,
         height=base_height,
         num_frames=60,
-        num_inference_steps=
+        num_inference_steps=15,  # Increased for better decoding quality
         output_type="pil",
-        guidance_scale=
-        decode_timestep=0.
-        decode_noise_scale=0.
+        guidance_scale=2.0,  # Consistent with generation
+        decode_timestep=0.1,
+        decode_noise_scale=0.1,
+        denoise_strength=0.2,  # Reduced for less noise
         generator=generator
     ).frames[0]
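Steps 4-6 (writing the frames to disk, synthesizing the voiceover, and building srt_content) are unchanged and hidden between this hunk and the next. The frame-export step, for instance, presumably uses something like diffusers' export_to_video; a hedged sketch (the fps value and file name are my guesses, not from the diff):

```python
# Hypothetical reconstruction of the unchanged Step 4 (hidden between hunks):
# write the decoded PIL frames to the mp4 that the FFmpeg stage consumes.
from diffusers.utils import export_to_video

video_path = export_to_video(frames, "output.mp4", fps=12)  # fps/file name assumed
```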
@@ -131,41 +131,61 @@ def generate_video(prompt, image_url):
     with open("subtitles.srt", "w", encoding="utf-8") as f:
         f.write(srt_content)
 
-    # Step 7: Merge video + audio + subtitles
+    # Step 7: Merge video + audio + subtitles with proper FFmpeg handling
     final_output = "final_with_audio.mp4"
     try:
+        # First, create video with subtitles
+        video_with_subs = "video_with_subs.mp4"
         (
             ffmpeg
             .input(video_path)
-            .
-            loglevel="error"
-            )
-            .run(overwrite_output=True)
+            .filter('subtitles', 'subtitles.srt')
+            .output(video_with_subs, vcodec='libx264', acodec='aac', loglevel='error')
+            .overwrite_output()
+            .run()
         )
 
-        #
+        # Then add audio track
+        video_in = ffmpeg.input(video_with_subs)
+        audio_in = ffmpeg.input('voice.wav')
         (
             ffmpeg
-            .input(
-            .input(
             .output(
+                video_in,
+                audio_in,
+                final_output,
+                vcodec='copy',
+                acodec='aac',
                 shortest=None,
-                loglevel=
+                loglevel='error'
             )
-            .
+            .overwrite_output()
+            .run()
         )
 
-        return
+        return final_output
 
     except Exception as e:
         print(f"FFmpeg error: {e}")
-        # Fallback:
+        # Fallback: try simpler approach without subtitles
+        try:
+            video_in = ffmpeg.input(video_path)
+            audio_in = ffmpeg.input('voice.wav')
+            (
+                ffmpeg
+                .output(
+                    video_in,
+                    audio_in,
+                    final_output,
+                    vcodec='libx264',
+                    acodec='aac',
+                    shortest=None,
+                    loglevel='error'
+                )
+                .overwrite_output()
+                .run()
+            )
+            return final_output
+        except Exception as e2:
+            print(f"FFmpeg fallback error: {e2}")
+            # Final fallback: return original video
+            return video_path
 
 def format_time(seconds):
     """Convert seconds to SRT time format"""
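One detail worth knowing about ffmpeg-python here: input() is a module-level function, not a stream method, so a two-input mux is expressed as separate input streams handed to output(), as in the hunk above. Since the library only builds an argument list until run() is called, the graph can be sanity-checked with compile(); a sketch under the same file names as the diff:

```python
import ffmpeg

# Build the same mux graph as the diff: burned-subtitle video + TTS audio.
video_in = ffmpeg.input("video_with_subs.mp4")
audio_in = ffmpeg.input("voice.wav")

cmd = (
    ffmpeg
    .output(
        video_in,
        audio_in,
        "final_with_audio.mp4",
        vcodec="copy",    # keep the x264 stream from the subtitle pass
        acodec="aac",
        shortest=None,    # kwargs with value None become bare flags: -shortest
        loglevel="error",
    )
    .overwrite_output()
    .compile()            # returns the argv list instead of running ffmpeg
)
print(" ".join(cmd))
# Roughly: ffmpeg -i video_with_subs.mp4 -i voice.wav -acodec aac -loglevel error
#          -shortest -vcodec copy final_with_audio.mp4 -y
```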
@@ -184,7 +204,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Video(label="Generated Video"),
     title="🎬 LTX AI Video Generator",
-    description="AI-powered video with voiceover and subtitles. Now outputs at
+    description="AI-powered video with voiceover and subtitles. Now outputs at 512x512 resolution with improved quality."
 )
 
 demo.launch()
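For orientation, most of the gr.Interface block lies outside this final hunk; it presumably wires generate_video to two text inputs. A hedged reconstruction (the input widgets and labels are my guesses; only the lines shown in the hunk are confirmed):

```python
import gradio as gr

demo = gr.Interface(
    fn=generate_video,                  # assumed: the function edited above
    inputs=[
        gr.Textbox(label="Prompt"),     # hypothetical: these lines are hidden by the hunk
        gr.Textbox(label="Image URL"),  # hypothetical: these lines are hidden by the hunk
    ],
    outputs=gr.Video(label="Generated Video"),
    title="🎬 LTX AI Video Generator",
    description="AI-powered video with voiceover and subtitles. Now outputs at 512x512 resolution with improved quality.",
)

demo.launch()
```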
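Finally, format_time appears only as trailing context in the FFmpeg hunk; its body never shows up in the diff. A minimal SRT timestamp formatter matching that docstring (my reconstruction, not necessarily the Space's exact code):

```python
def format_time(seconds):
    """Convert seconds to SRT time format (HH:MM:SS,mmm)."""
    total_ms = int(round(seconds * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1_000)
    # SRT separates seconds from milliseconds with a comma, not a dot.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# e.g. format_time(3.5) -> "00:00:03,500"
```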