Spaces:

Pie31415
/

control-animation

Build error

App Files Files Community

Pie31415 committed on May 5, 2023

Commit

527b597

1 Parent(s): ac67678

updated

Browse files

Files changed (7) hide show

app.py +1 -6
text_to_animation/model.py +75 -321
text_to_animation/models/cross_frame_attention_flax.py +0 -1
text_to_animation/models/unet_3d_blocks_flax.py +717 -0
text_to_animation/models/unet_3d_condition_flax.py +611 -0
text_to_animation/pipelines/text_to_video_pipeline_flax.py +887 -142
webui/app_control_animation.py +87 -120

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ Our code uses <a href="https://www.humphreyshi.com/home">Text2Video-Zero</a> and
 notice = """
 <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
 <br/>
-<a href="https://github.com/Pie31415/control-animation">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
 """
@@ -51,13 +51,8 @@ with gr.Blocks(css="style.css") as demo:
     if on_huggingspace:
         gr.HTML(notice)
-    # NOTE: In our final demo we should consider removing zero-shot t2v and pose conditional
     with gr.Tab("Control Animation"):
         create_demo_animation(model)
-    # with gr.Tab("Zero-Shot Text2Video"):
-    #     create_demo_text_to_video(model)
-    # with gr.Tab("Pose Conditional"):
-    #     create_demo_pose(model)
 if on_huggingspace:
     demo.queue(max_size=20)

 notice = """
 <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
 <br/>
+<a href="https://huggingface.co/spaces/Pie31415/control-animation?duplicate=true">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
 """
     if on_huggingspace:
         gr.HTML(notice)
     with gr.Tab("Control Animation"):
         create_demo_animation(model)
 if on_huggingspace:
     demo.queue(max_size=20)

text_to_animation/model.py CHANGED Viewed

@@ -3,7 +3,6 @@ from enum import Enum
 import gc
 import numpy as np
 import jax.numpy as jnp
-import tomesd
 import jax
 from PIL import Image
@@ -20,9 +19,12 @@ from diffusers import (
     FlaxAutoencoderKL,
     FlaxStableDiffusionControlNetPipeline,
     StableDiffusionPipeline,
 )
-from text_to_animation.models.unet_2d_condition_flax import FlaxUNet2DConditionModel
-from text_to_animation.models.controlnet_flax import FlaxControlNetModel
 from text_to_animation.pipelines.text_to_video_pipeline_flax import (
     FlaxTextToVideoPipeline,
@@ -48,37 +50,31 @@ def replicate_devices(array):
 class ControlAnimationModel:
-    def __init__(self, device, dtype, **kwargs):
-        self.device = device
         self.dtype = dtype
         self.rng = jax.random.PRNGKey(0)
-        self.pipe_dict = {
-            ModelType.Text2Video: FlaxTextToVideoPipeline,  # TODO: Replace with our TextToVideo JAX Pipeline
-            ModelType.ControlNetPose: FlaxStableDiffusionControlNetPipeline,
-        }
         self.pipe = None
         self.model_type = None
         self.states = {}
         self.model_name = ""
-        self.from_local = True  # if the attn model is available in local (after adaptation by adapt_attn.py)
     def set_model(
         self,
-        model_type: ModelType,
         model_id: str,
-        controlnet,
-        controlnet_params,
-        tokenizer,
-        scheduler,
-        scheduler_state,
         **kwargs,
     ):
         if hasattr(self, "pipe") and self.pipe is not None:
             del self.pipe
             self.pipe = None
         gc.collect()
         scheduler, scheduler_state = FlaxDDIMScheduler.from_pretrained(
             model_id, subfolder="scheduler", from_pt=True
         )
@@ -86,17 +82,12 @@ class ControlAnimationModel:
         feature_extractor = CLIPFeatureExtractor.from_pretrained(
             model_id, subfolder="feature_extractor"
         )
-        if self.from_local:
-            unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
-                f'./{model_id.split("/")[-1]}',
-                subfolder="unet",
-                from_pt=True,
-                dtype=self.dtype,
-            )
-        else:
-            unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
-                model_id, subfolder="unet", from_pt=True, dtype=self.dtype
-            )
         vae, vae_params = FlaxAutoencoderKL.from_pretrained(
             model_id, subfolder="vae", from_pt=True, dtype=self.dtype
         )
@@ -108,6 +99,7 @@ class ControlAnimationModel:
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             unet=unet,
             controlnet=controlnet,
             scheduler=scheduler,
             safety_checker=None,
@@ -121,313 +113,52 @@ class ControlAnimationModel:
             "text_encoder": text_encoder.params,
         }
         self.p_params = jax_utils.replicate(self.params)
-        self.model_type = model_type
         self.model_name = model_id
-    # def inference_chunk(self, image, frame_ids, prompt, negative_prompt, **kwargs):
-    #     prompt_ids = self.pipe.prepare_text_inputs(prompt)
-    #     n_prompt_ids = self.pipe.prepare_text_inputs(negative_prompt)
-    #     latents = kwargs.pop('latents')
-    #     # rng = jax.random.split(self.rng, jax.device_count())
-    #     prng, self.rng = jax.random.split(self.rng)
-    #     #prng = jax.numpy.stack([prng] * jax.device_count())#same prng seed on every device
-    #     prng_seed = jax.random.split(prng, jax.device_count())
-    #     image = replicate_devices(image[frame_ids])
-    #     latents = replicate_devices(latents)
-    #     prompt_ids = replicate_devices(prompt_ids)
-    #     n_prompt_ids = replicate_devices(n_prompt_ids)
-    #     return (self.pipe(image=image,
-    #                         latents=latents,
-    #                         prompt_ids=prompt_ids,
-    #                         neg_prompt_ids=n_prompt_ids,
-    #                         params=self.p_params,
-    #                         prng_seed=prng_seed, jit = True,
-    #                         ).images)[0]
-    def inference(self, image, split_to_chunks=False, chunk_size=8, **kwargs):
-        if not hasattr(self, "pipe") or self.pipe is None:
-            return
-        if "merging_ratio" in kwargs:
-            merging_ratio = kwargs.pop("merging_ratio")
-            # if merging_ratio > 0:
-            tomesd.apply_patch(self.pipe, ratio=merging_ratio)
-        # f = image.shape[0]
-        assert "prompt" in kwargs
-        prompt = [kwargs.pop("prompt")]
-        negative_prompt = [kwargs.pop("negative_prompt", "")]
-        frames_counter = 0
-        # Processing chunk-by-chunk
-        if split_to_chunks:
-            pass
-            # # not tested
-            # f = image.shape[0]
-            # chunk_ids = np.arange(0, f, chunk_size - 1)
-            # result = []
-            # for i in range(len(chunk_ids)):
-            #     ch_start = chunk_ids[i]
-            #     ch_end = f if i == len(chunk_ids) - 1 else chunk_ids[i + 1]
-            #     frame_ids = [0] + list(range(ch_start, ch_end))
-            #     print(f'Processing chunk {i + 1} / {len(chunk_ids)}')
-            #     result.append(self.inference_chunk(image=image,
-            #                                        frame_ids=frame_ids,
-            #                                        prompt=prompt,
-            #                                        negative_prompt=negative_prompt,
-            #                                        **kwargs).images[1:])
-            #     frames_counter += len(chunk_ids)-1
-            #     if on_huggingspace and frames_counter >= 80:
-            #         break
-            # result = np.concatenate(result)
-            # return result
-        else:
-            if "jit" in kwargs and kwargs.pop("jit"):
-                prompt_ids = self.pipe.prepare_text_inputs(prompt)
-                n_prompt_ids = self.pipe.prepare_text_inputs(negative_prompt)
-                latents = kwargs.pop("latents")
-                prng, self.rng = jax.random.split(self.rng)
-                prng_seed = jax.random.split(prng, jax.device_count())
-                image = replicate_devices(image)
-                latents = replicate_devices(latents)
-                prompt_ids = replicate_devices(prompt_ids)
-                n_prompt_ids = replicate_devices(n_prompt_ids)
-                return (
-                    self.pipe(
-                        image=image,
-                        latents=latents,
-                        prompt_ids=prompt_ids,
-                        neg_prompt_ids=n_prompt_ids,
-                        params=self.p_params,
-                        prng_seed=prng_seed,
-                        jit=True,
-                    ).images
-                )[0]
-            else:
-                prompt_ids = self.pipe.prepare_text_inputs(prompt)
-                n_prompt_ids = self.pipe.prepare_text_inputs(negative_prompt)
-                latents = kwargs.pop("latents")
-                prng_seed, self.rng = jax.random.split(self.rng)
-                return self.pipe(
-                    image=image,
-                    latents=latents,
-                    prompt_ids=prompt_ids,
-                    neg_prompt_ids=n_prompt_ids,
-                    params=self.params,
-                    prng_seed=prng_seed,
-                    jit=False,
-                ).images
-    def process_controlnet_pose(
         self,
-        video_path,
-        prompt,
-        chunk_size=8,
-        watermark="Picsart AI Research",
-        merging_ratio=0.0,
-        num_inference_steps=20,
-        controlnet_conditioning_scale=1.0,
-        guidance_scale=9.0,
-        seed=42,
-        eta=0.0,
-        resolution=512,
-        use_cf_attn=True,
-        save_path=None,
-    ):
-        print("Module Pose")
         video_path = gradio_utils.motion_to_video_path(video_path)
-        if self.model_type != ModelType.ControlNetPose:
-            controlnet = FlaxControlNetModel.from_pretrained(
-                "fusing/stable-diffusion-v1-5-controlnet-openpose"
-            )
-            self.set_model(
-                ModelType.ControlNetPose,
-                model_id="runwayml/stable-diffusion-v1-5",
-                controlnet=controlnet,
-            )
-            self.pipe.scheduler = FlaxDDIMScheduler.from_config(
-                self.pipe.scheduler.config
-            )
-            if use_cf_attn:
-                self.pipe.unet.set_attn_processor(processor=self.controlnet_attn_proc)
-                self.pipe.controlnet.set_attn_processor(
-                    processor=self.controlnet_attn_proc
-                )
-        video_path = (
-            gradio_utils.motion_to_video_path(video_path)
-            if "Motion" in video_path
-            else video_path
-        )
-        added_prompt = "best quality, extremely detailed, HD, ultra-realistic, 8K, HQ, masterpiece, trending on artstation, art, smooth"
-        negative_prompts = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic"
         video, fps = utils.prepare_video(
-            video_path, resolution, self.device, self.dtype, False, output_fps=4
-        )
-        control = (
-            utils.pre_process_pose(video, apply_pose_detect=False)
-            .to(self.device)
-            .to(self.dtype)
         )
-        f, _, h, w = video.shape
-        self.generator.manual_seed(seed)
-        latents = torch.randn(
-            (1, 4, h // 8, w // 8),
-            dtype=self.dtype,
-            device=self.device,
-            generator=self.generator,
-        )
-        latents = latents.repeat(f, 1, 1, 1)
-        result = self.inference(
-            image=control,
-            prompt=prompt + ", " + added_prompt,
-            height=h,
-            width=w,
-            negative_prompt=negative_prompts,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-            eta=eta,
-            latents=latents,
-            seed=seed,
-            output_type="numpy",
-            split_to_chunks=True,
-            chunk_size=chunk_size,
-            merging_ratio=merging_ratio,
         )
-        return utils.create_gif(
-            result,
-            fps,
-            path=save_path,
-            watermark=gradio_utils.logo_name_to_path(watermark),
-        )
-    def process_text2video(
-        self,
-        prompt,
-        model_name="dreamlike-art/dreamlike-photoreal-2.0",
-        motion_field_strength_x=12,
-        motion_field_strength_y=12,
-        t0=44,
-        t1=47,
-        n_prompt="",
-        chunk_size=8,
-        video_length=8,
-        watermark="Picsart AI Research",
-        merging_ratio=0.0,
-        seed=0,
-        resolution=512,
-        fps=2,
-        use_cf_attn=True,
-        use_motion_field=True,
-        smooth_bg=False,
-        smooth_bg_strength=0.4,
-        path=None,
-    ):
-        print("Module Text2Video")
-        if self.model_type != ModelType.Text2Video or model_name != self.model_name:
-            print("Model update")
-            unet = FlaxUNet2DConditionModel.from_pretrained(
-                model_name, subfolder="unet"
-            )
-            self.set_model(ModelType.Text2Video, model_id=model_name, unet=unet)
-            self.pipe.scheduler = FlaxDDIMScheduler.from_config(
-                self.pipe.scheduler.config
-            )
-            if use_cf_attn:
-                self.pipe.unet.set_attn_processor(processor=self.text2video_attn_proc)
-        self.generator.manual_seed(seed)
-        added_prompt = "high quality, HD, 8K, trending on artstation, high focus, dramatic lighting"
-        negative_prompts = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic"
-        prompt = prompt.rstrip()
-        if len(prompt) > 0 and (prompt[-1] == "," or prompt[-1] == "."):
-            prompt = prompt.rstrip()[:-1]
-        prompt = prompt.rstrip()
-        prompt = prompt + ", " + added_prompt
-        if len(n_prompt) > 0:
-            negative_prompt = n_prompt
-        else:
-            negative_prompt = None
-        result = self.inference(
-            prompt=prompt,
-            video_length=video_length,
-            height=resolution,
-            width=resolution,
-            num_inference_steps=50,
-            guidance_scale=7.5,
-            guidance_stop_step=1.0,
-            t0=t0,
-            t1=t1,
-            motion_field_strength_x=motion_field_strength_x,
-            motion_field_strength_y=motion_field_strength_y,
-            use_motion_field=use_motion_field,
-            smooth_bg=smooth_bg,
-            smooth_bg_strength=smooth_bg_strength,
-            seed=seed,
-            output_type="numpy",
-            negative_prompt=negative_prompt,
-            merging_ratio=merging_ratio,
-            split_to_chunks=True,
-            chunk_size=chunk_size,
-        )
-        return utils.create_video(
-            result, fps, path=path, watermark=gradio_utils.logo_name_to_path(watermark)
-        )
-    @staticmethod
-    def to_pil_images(images: torch.Tensor) -> List[Image.Image]:
-        images = (images / 2 + 0.5).clamp(0, 1)
-        images = images.cpu().permute(0, 2, 3, 1).float().numpy()
-        images = np.round(images * 255).astype(np.uint8)
-        return [Image.fromarray(image) for image in images]
-    def generate_initial_frames(
-        self,
-        prompt: str,
-        model_link: str = "dreamlike-art/dreamlike-photoreal-2.0",
-        is_safetensor: bool = False,
-        n_prompt: str = "",
-        width: int = 512,
-        height: int = 512,
-        # batch_count: int = 4,
-        # batch_size: int = 1,
-        cfg_scale: float = 7.0,
-        seed: int = 0,
-    ) -> List[Image.Image]:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        pipe = StableDiffusionPipeline.from_pretrained(model_link)
-        batch_size = 4
-        prompt = [prompt] * batch_size
-        negative_prompt = [n_prompt] * batch_size
-        images = pipe(
-            prompt,
-            negative_prompt=negative_prompt,
-            width=width,
-            height=height,
-            guidance_scale=cfg_scale,
-            generator=generator,
-        ).images
-        pil_images = self.to_pil_images(images)
-        return pil_images
     def generate_animation(
         self,
         prompt: str,
         model_link: str = "dreamlike-art/dreamlike-photoreal-2.0",
-        is_safetensor: bool = False,
         motion_field_strength_x: int = 12,
         motion_field_strength_y: int = 12,
         t0: int = 44,
@@ -445,6 +176,29 @@ class ControlAnimationModel:
         smooth_bg_strength: float = 0.4,
         path: str = None,
     ):
-        if is_safetensor and model_link[-len(".safetensors") :] == ".safetensors":
-            pipe = utils.load_safetensors_model(model_link)
-        return

 import gc
 import numpy as np
 import jax.numpy as jnp
 import jax
 from PIL import Image
     FlaxAutoencoderKL,
     FlaxStableDiffusionControlNetPipeline,
     StableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
 )
+from text_to_animation.models.unet_2d_condition_flax import (
+    FlaxUNet2DConditionModel as CustomFlaxUNet2DConditionModel,
+)
+from diffusers import FlaxControlNetModel
 from text_to_animation.pipelines.text_to_video_pipeline_flax import (
     FlaxTextToVideoPipeline,
 class ControlAnimationModel:
+    def __init__(self, dtype, **kwargs):
         self.dtype = dtype
         self.rng = jax.random.PRNGKey(0)
         self.pipe = None
         self.model_type = None
         self.states = {}
         self.model_name = ""
     def set_model(
         self,
         model_id: str,
         **kwargs,
     ):
         if hasattr(self, "pipe") and self.pipe is not None:
             del self.pipe
             self.pipe = None
         gc.collect()
+        controlnet, controlnet_params = FlaxControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-openpose",
+            from_pt=True,
+            dtype=jnp.float16,
+        )
         scheduler, scheduler_state = FlaxDDIMScheduler.from_pretrained(
             model_id, subfolder="scheduler", from_pt=True
         )
         feature_extractor = CLIPFeatureExtractor.from_pretrained(
             model_id, subfolder="feature_extractor"
         )
+        unet, unet_params = CustomFlaxUNet2DConditionModel.from_pretrained(
+            model_id, subfolder="unet", from_pt=True, dtype=self.dtype
+        )
+        unet_vanilla, _ = FlaxUNet2DConditionModel.from_pretrained(
+            model_id, subfolder="unet", from_pt=True, dtype=self.dtype
+        )
         vae, vae_params = FlaxAutoencoderKL.from_pretrained(
             model_id, subfolder="vae", from_pt=True, dtype=self.dtype
         )
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             unet=unet,
+            unet_vanilla=unet_vanilla,
             controlnet=controlnet,
             scheduler=scheduler,
             safety_checker=None,
             "text_encoder": text_encoder.params,
         }
         self.p_params = jax_utils.replicate(self.params)
         self.model_name = model_id
+    # Generate `num_imgs` candidate starting frames for `prompt`, conditioned
+    # on a control signal derived from the video at `video_path`.
+    #
+    # Visible steps: (re)load the pipeline with set_model(model_id), resolve
+    # the video path, prepend fixed quality tags to the positive and negative
+    # prompts, preprocess the video into `control`, build one PRNG key per
+    # image, and call self.pipe.generate_starting_frames.
+    #
+    # Returns a list with one np.array per generated image.
+    # NOTE(review): the annotation says List[Image.Image], but the final
+    # statement wraps every item in np.array -- confirm what callers expect.
+    def generate_initial_frames(
         self,
+        prompt: str,
+        video_path: str,
+        n_prompt: str = "",
+        num_imgs: int = 4,
+        resolution: int = 512,
+        model_id: str = "runwayml/stable-diffusion-v1-5",
+    ) -> List[Image.Image]:
+        # The pipeline is rebuilt on every call, before anything else runs.
+        self.set_model(model_id=model_id)
         video_path = gradio_utils.motion_to_video_path(video_path)
+        added_prompt = "high quality, best quality, HD, clay stop-motion, claymation, HQ, masterpiece, art, smooth"
+        prompts = added_prompt + ", " + prompt
+        added_n_prompt = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly"
+        # With the default n_prompt="" this string ends in a trailing ", ".
+        negative_prompts = added_n_prompt + ", " + n_prompt
+        # `fps` is unpacked here but not read again in this method.
         video, fps = utils.prepare_video(
+            video_path, resolution, None, self.dtype, False, output_fps=4
         )
+        control = utils.pre_process_pose(video, apply_pose_detect=False)
+        # NOTE(review): self.rng is passed to randint as-is and is never split
+        # or reassigned in this method, so repeated calls presumably draw the
+        # same seeds -- confirm whether that is intended.
+        seeds = [seed for seed in jax.random.randint(self.rng, [num_imgs], 0, 65536)]
+        prngs = [jax.random.PRNGKey(seed) for seed in seeds]
+        images = self.pipe.generate_starting_frames(
+            params=self.params,
+            prngs=prngs,
+            controlnet_image=control,
+            prompt=prompts,
+            neg_prompt=negative_prompts,
         )
+        # Split along the first axis into one array per image.
+        images = [np.array(images[i]) for i in range(images.shape[0])]
+        return images
     def generate_animation(
         self,
         prompt: str,
+        initial_frame_index: int,
+        input_video_path: str,
         model_link: str = "dreamlike-art/dreamlike-photoreal-2.0",
         motion_field_strength_x: int = 12,
         motion_field_strength_y: int = 12,
         t0: int = 44,
         smooth_bg_strength: float = 0.4,
         path: str = None,
     ):
+        video_path = gradio_utils.motion_to_video_path(video_path)
+        # added_prompt = 'best quality, HD, clay stop-motion, claymation, HQ, masterpiece, art, smooth'
+        # added_prompt = 'high quality, anatomically correct, clay stop-motion, aardman, claymation, smooth'
+        added_n_prompt = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly"
+        negative_prompts = added_n_prompt + ", " + n_prompt
+        video, fps = utils.prepare_video(
+            video_path, resolution, None, self.dtype, False, output_fps=4
+        )
+        control = utils.pre_process_pose(video, apply_pose_detect=False)
+        f, _, h, w = video.shape
+        prng_seed = jax.random.PRNGKey(seed)
+        vid = self.pipe.generate_video(
+            prompt,
+            image=control,
+            params=self.params,
+            prng_seed=prng_seed,
+            neg_prompt="",
+            controlnet_conditioning_scale=1.0,
+            motion_field_strength_x=3,
+            motion_field_strength_y=4,
+            jit=True,
+        ).image
+        return utils.create_gif(np.array(vid), 4, path=None, watermark=None)

text_to_animation/models/cross_frame_attention_flax.py CHANGED Viewed

@@ -50,7 +50,6 @@ class FlaxCrossFrameAttention(nn.Module):
         batch_size: The number that represents actual batch size, other than the frames.
             For example, using calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
             equal to 2, due to classifier-free guidance.
     """
     query_dim: int
     heads: int = 8

         batch_size: The number that represents actual batch size, other than the frames.
             For example, using calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
             equal to 2, due to classifier-free guidance.
     """
     query_dim: int
     heads: int = 8

text_to_animation/models/unet_3d_blocks_flax.py ADDED Viewed

	@@ -0,0 +1,717 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from torch import nn
+# from .resnet import Downsample2D, ResnetBlock2D, TemporalConvLayer, Upsample2D
+# from diffusers.models.transformer_2d import Transformer2DModel
+# from .transformer_temporal import TransformerTemporalModel
+from diffusers.models.resnet_flax import (
+    FlaxDownsample2D,
+    FlaxResnetBlock2D,
+    FlaxUpsample2D,
+)
+from diffusers.models.attention_flax import FlaxTransformer2DModel
+from diffusers.models.transformer_temporal import (
+    TransformerTemporalModel,
+)  # TODO: convert to flax
+# Factory: build the down-sampling block selected by `down_block_type`.
+#
+# Accepted names are "DownBlock3D" and "CrossAttnDownBlock3D"; any other value
+# raises ValueError. The attention-related arguments (attn_num_head_channels,
+# cross_attention_dim, dual_cross_attention, use_linear_projection,
+# only_cross_attention, upcast_attention) are forwarded only to
+# CrossAttnDownBlock3D; DownBlock3D receives the resnet/downsample ones.
+#
+# Raises:
+#     ValueError: if `down_block_type` is "CrossAttnDownBlock3D" and
+#         `cross_attention_dim` is None, or if the type name is unknown.
+#
+# NOTE(review): DownBlock3D is not defined or imported in the part of this
+# file visible here -- confirm it is defined further down.
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=False,
+    use_linear_projection=True,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    if down_block_type == "DownBlock3D":
+        return DownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "CrossAttnDownBlock3D":
+        # The cross-attention variant cannot be built without a context width.
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock3D"
+            )
+        return CrossAttnDownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    # Reached only when neither branch above returned.
+    raise ValueError(f"{down_block_type} does not exist.")
+# Factory: build the up-sampling block selected by `up_block_type`.
+#
+# Accepted names are "UpBlock3D" and "CrossAttnUpBlock3D"; any other value
+# raises ValueError. The attention-related arguments (attn_num_head_channels,
+# cross_attention_dim, dual_cross_attention, use_linear_projection,
+# only_cross_attention, upcast_attention) are forwarded only to
+# CrossAttnUpBlock3D; UpBlock3D receives the resnet/upsample ones.
+#
+# Raises:
+#     ValueError: if `up_block_type` is "CrossAttnUpBlock3D" and
+#         `cross_attention_dim` is None, or if the type name is unknown.
+#
+# NOTE(review): UpBlock3D and CrossAttnUpBlock3D are not defined or imported
+# in the part of this file visible here -- confirm they are defined further
+# down.
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=False,
+    use_linear_projection=True,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    if up_block_type == "UpBlock3D":
+        return UpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "CrossAttnUpBlock3D":
+        # The cross-attention variant cannot be built without a context width.
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock3D"
+            )
+        return CrossAttnUpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    # Reached only when neither branch above returned.
+    raise ValueError(f"{up_block_type} does not exist.")
+# UNet mid block with cross attention for video input: one resnet + temporal
+# conv, followed by `num_layers` repetitions of (spatial attention, temporal
+# attention, resnet, temporal conv).
+#
+# NOTE(review): despite the "Flax" prefix, this class subclasses `nn.Module`
+# with `nn` imported from torch at the top of this file, and stores its
+# children in nn.ModuleList. It also mixes FlaxResnetBlock2D (first resnet)
+# with ResnetBlock2D (resnets added in the loop). This looks like a partially
+# ported block -- confirm before relying on it.
+# NOTE(review): TemporalConvLayer, Transformer2DModel and ResnetBlock2D are
+# referenced below, but the imports that name them are commented out at the
+# top of this file. Unless they are defined elsewhere, constructing this class
+# would raise NameError -- verify.
+class FlaxUNetMidBlock3DCrossAttn(nn.Module):
+    # NOTE(review): `dual_cross_attention` is accepted but not referenced in
+    # the body of __init__.
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        use_linear_projection=True,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        # Fall back to min(in_channels // 4, 32) groups when None is passed.
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+        # there is always at least one resnet
+        resnets = [
+            FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        # The temporal conv dropout is fixed at 0.1 here; the `dropout`
+        # argument is passed to the resnets only.
+        temp_convs = [
+            TemporalConvLayer(
+                in_channels,
+                in_channels,
+                dropout=0.1,
+            )
+        ]
+        attentions = []
+        temp_attentions = []
+        # Each iteration appends one spatial attention, one temporal attention,
+        # one resnet and one temporal conv, so resnets/temp_convs end up one
+        # element longer than attentions/temp_attentions.
+        for _ in range(num_layers):
+            attentions.append(
+                Transformer2DModel(
+                    in_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    in_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    in_channels,
+                    in_channels,
+                    dropout=0.1,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+    # Run the block: the first resnet and temporal conv, then each
+    # (attention, temporal attention, resnet, temporal conv) group in order.
+    # Returns the resulting hidden states.
+    # NOTE(review): `attention_mask` is accepted but not referenced in the
+    # body of forward.
+    def forward(
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        num_frames=1,
+        cross_attention_kwargs=None,
+    ):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames)
+        # resnets[1:] / temp_convs[1:] skip the leading pair applied above.
+        for attn, temp_attn, resnet, temp_conv in zip(
+            self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:]
+        ):
+            # Both attention calls return an object whose `.sample` attribute
+            # is used as the next hidden states.
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+        return hidden_states
+class CrossAttnDownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        temp_attentions = []
+        temp_convs = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                )
+            )
+            attentions.append(
+                Transformer2DModel(
+                    out_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    out_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        num_frames=1,
+        cross_attention_kwargs=None,
+    ):
+        # TODO(Patrick, William) - attention mask is not used
+        output_states = ()
+        for resnet, temp_conv, attn, temp_attn in zip(
+            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+        ):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class DownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None, num_frames=1):
+        output_states = ()
+        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class CrossAttnUpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+        attentions = []
+        temp_attentions = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                )
+            )
+            attentions.append(
+                Transformer2DModel(
+                    out_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            temp_attentions.append(
+                TransformerTemporalModel(
+                    out_channels // attn_num_head_channels,
+                    attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        self.attentions = nn.ModuleList(attentions)
+        self.temp_attentions = nn.ModuleList(temp_attentions)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+        upsample_size=None,
+        attention_mask=None,
+        num_frames=1,
+        cross_attention_kwargs=None,
+    ):
+        # TODO(Patrick, William) - attention mask is not used
+        for resnet, temp_conv, attn, temp_attn in zip(
+            self.resnets, self.temp_convs, self.attentions, self.temp_attentions
+        ):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+            hidden_states = temp_attn(
+                hidden_states,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            ).sample
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states
+class UpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+        temp_convs = []
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            temp_convs.append(
+                TemporalConvLayer(
+                    out_channels,
+                    out_channels,
+                    dropout=0.1,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.temp_convs = nn.ModuleList(temp_convs)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        upsample_size=None,
+        num_frames=1,
+    ):
+        for resnet, temp_conv in zip(self.resnets, self.temp_convs):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            hidden_states = resnet(hidden_states, temb)
+            hidden_states = temp_conv(hidden_states, num_frames=num_frames)
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states

text_to_animation/models/unet_3d_condition_flax.py ADDED Viewed

	@@ -0,0 +1,611 @@

+# Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+# Copyright 2023 The ModelScope Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import UNet2DConditionLoadersMixin
+from ..utils import BaseOutput, logging
+from .attention_processor import AttentionProcessor, AttnProcessor
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .transformer_temporal import TransformerTemporalModel
+from .unet_3d_blocks import (
+    CrossAttnDownBlock3D,
+    CrossAttnUpBlock3D,
+    DownBlock3D,
+    UNetMidBlock3DCrossAttn,
+    UpBlock3D,
+    get_down_block,
+    get_up_block,
+)
+# Module-level logger obtained from the project's `logging` helper (imported from `..utils`), keyed by this module's name.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+    """
+    Output container with a single `sample` tensor field.
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+    # NOTE(review): the axis order documented above cannot be checked from this class alone — confirm against the model's forward pass.
+    sample: torch.FloatTensor
+class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    UNet3DConditionModel is a conditional 3D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns a sample-shaped output.
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the models (such as downloading or saving, etc.)
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, it will skip the normalization and activation layers in post-processing
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
+        attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 64): The dimension of the attention heads.
+    """
+    _supports_gradient_checkpointing = False
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1024,
+        attention_head_dim: Union[int, Tuple[int]] = 64,
+    ):
+        """Assemble the network: input conv, timestep embedding, input temporal
+        transformer, down blocks, mid block, up blocks and the output head.
+        Raises:
+            ValueError: if `down_block_types` and `up_block_types` differ in length,
+                if `block_out_channels` and `down_block_types` differ in length, or if
+                a non-int `attention_head_dim` does not have one entry per down block.
+        """
+        super().__init__()
+        self.sample_size = sample_size
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(
+            down_block_types
+        ):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+        # input
+        conv_in_kernel = 3
+        conv_out_kernel = 3
+        # (kernel - 1) // 2 padding keeps the spatial size unchanged for an odd kernel at stride 1.
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels,
+            block_out_channels[0],
+            kernel_size=conv_in_kernel,
+            padding=conv_in_padding,
+        )
+        # time
+        # The time embedding is four times as wide as the first block's channel count.
+        time_embed_dim = block_out_channels[0] * 4
+        # NOTE(review): the positional `True, 0` are presumably flip_sin_to_cos and the frequency shift — confirm against `Timesteps`.
+        self.time_proj = Timesteps(block_out_channels[0], True, 0)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )
+        # NOTE(review): `attention_head_dim` is forwarded here as given, before the int -> tuple
+        # expansion further down, so a tuple value reaches TransformerTemporalModel unchanged — confirm this is intended.
+        self.transformer_in = TransformerTemporalModel(
+            num_attention_heads=8,
+            attention_head_dim=attention_head_dim,
+            in_channels=block_out_channels[0],
+            num_layers=1,
+        )
+        # containers for the down and up stages (no class embedding is created in this constructor)
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        # Broadcast a single head dim to one entry per down block.
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            # Each block's input width is the previous block's output width.
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=False,
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        self.mid_block = UNetMidBlock3DCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim[-1],
+            resnet_groups=norm_num_groups,
+            dual_cross_attention=False,
+        )
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+        # up
+        # The up path walks the channel and head-dim configuration in reverse.
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            # Channel count of the next entry in the reversed list, clamped to the last index.
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            # Up blocks get one more layer than down blocks.
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                dual_cross_attention=False,
+            )
+            self.up_blocks.append(up_block)
+            # NOTE(review): this store is overwritten at the top of the next iteration and not read after the loop.
+            prev_output_channel = output_channel
+        # out
+        # Without `norm_num_groups` the output norm and activation are left as None.
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0],
+                num_groups=norm_num_groups,
+                eps=norm_eps,
+            )
+            self.conv_act = nn.SiLU()
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0],
+            out_channels,
+            kernel_size=conv_out_kernel,
+            padding=conv_out_padding,
+        )
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(
+            name: str,
+            module: torch.nn.Module,
+            processors: Dict[str, AttentionProcessor],
+        ):
+            if hasattr(module, "set_processor"):
+                processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+        num_sliceable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+        slice_size = (
+            num_sliceable_layers * [slice_size]
+            if not isinstance(slice_size, list)
+            else slice_size
+        )
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(
+            module: torch.nn.Module, slice_size: List[int]
+        ):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]
+    ):
+        r"""
+        Parameters:
+            `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                of **all** `Attention` layers.
+            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        self.set_attn_processor(AttnProcessor())
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(
+            module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)
+        ):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        r"""
+        Run the 3D UNet on a batch of video latents: frames are folded into the batch axis, passed through the
+        down / mid / up blocks, and the frame axis is restored on the output.
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, num_frames, height, width) noisy inputs tensor. The
+                frame count is read from `sample.shape[2]`.
+            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            class_labels (`torch.Tensor`, *optional*):
+                Accepted for interface compatibility; it is not referenced in this method body.
+            timestep_cond (`torch.Tensor`, *optional*):
+                Passed as the second argument to `self.time_embedding`.
+            attention_mask (`torch.Tensor`, *optional*):
+                If given, it is converted to an additive bias `(1 - mask) * -10000.0` and gets an extra
+                dimension at position 1 before being handed to the attention blocks.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                Tensors added, in order, to the residuals collected from the down blocks.
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                Tensor added to the sample after the mid block.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`UNet3DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`UNet3DConditionOutput`] or `tuple`:
+            [`UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be at least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (number of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # prepare attention_mask: turn a 1/0 keep-mask into an additive bias (0 for keep, -10000 for drop)
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            # 64-bit dtypes are avoided when the sample lives on an "mps" device
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            # 0-d tensor: add a leading axis so it can be expanded below
+            timesteps = timesteps[None].to(sample.device)
+        # the frame count is taken from dim 2 of the (not yet flattened) input
+        num_frames = sample.shape[2]
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        # repeat each batch entry once per frame so the conditioning lines up with the
+        # (batch * num_frames) layout the sample is reshaped into below
+        emb = emb.repeat_interleave(repeats=num_frames, dim=0)
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(
+            repeats=num_frames, dim=0
+        )
+        # 2. pre-process
+        # move the frame axis next to the batch axis and fold both together:
+        # (batch, channel, num_frames, h, w) -> (batch * num_frames, channel, h, w)
+        sample = sample.permute(0, 2, 1, 3, 4).reshape(
+            (sample.shape[0] * num_frames, -1) + sample.shape[3:]
+        )
+        sample = self.conv_in(sample)
+        sample = self.transformer_in(
+            sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs
+        ).sample
+        # 3. down
+        # residuals are accumulated in order; the up path consumes them from the end
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "has_cross_attention")
+                and downsample_block.has_cross_attention
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample, temb=emb, num_frames=num_frames
+                )
+            down_block_res_samples += res_samples
+        if down_block_additional_residuals is not None:
+            # NOTE: pairing uses zip, so if the two tuples differ in length the surplus
+            # entries are silently dropped from the rebuilt residual tuple
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = (
+                    down_block_res_sample + down_block_additional_residual
+                )
+                new_down_block_res_samples += (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                num_frames=num_frames,
+                cross_attention_kwargs=cross_attention_kwargs,
+            )
+        # the extra mid residual is applied whether or not a mid block exists
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            # take the last len(resnets) skip tensors for this block and drop them from the pending tuple
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if (
+                hasattr(upsample_block, "has_cross_attention")
+                and upsample_block.has_cross_attention
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                )
+        # 6. post-process
+        # norm + activation are skipped when `conv_norm_out` is falsy
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        # unfold the frame axis again and move channels ahead of frames:
+        # (batch * num_frames, channel, h, w) -> (batch, channel, num_frames, h, w)
+        sample = (
+            sample[None, :]
+            .reshape((-1, num_frames) + sample.shape[1:])
+            .permute(0, 2, 1, 3, 4)
+        )
+        if not return_dict:
+            return (sample,)
+        return UNet3DConditionOutput(sample=sample)

text_to_animation/pipelines/text_to_video_pipeline_flax.py CHANGED Viewed

@@ -6,11 +6,16 @@ import jax.numpy as jnp
 import numpy as np
 from flax.core.frozen_dict import FrozenDict
 from flax.jax_utils import unreplicate
 from flax.training.common_utils import shard
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 from einops import rearrange, repeat
-from diffusers.models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel
 from diffusers.schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
@@ -20,17 +25,24 @@ from diffusers.schedulers import (
 from diffusers.utils import PIL_INTERPOLATION, logging, replace_example_docstring
 from diffusers.pipelines.pipeline_flax_utils import FlaxDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker_flax import FlaxStableDiffusionSafetyChecker
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 """
 Text2Video-Zero:
  - Inputs: Prompt, Pose Control via mp4/gif, First Frame (?)
  - JAX implementation
  - 3DUnet to replace 2DUnetConditional
 """
-DEBUG = False # Set to True to use python for loop instead of jax.fori_loop for easier debugging
 EXAMPLE_DOC_STRING = """
     Examples:
@@ -89,16 +101,22 @@ EXAMPLE_DOC_STRING = """
         >>> output_images.save("generated_image.png")
         ```
 """
 class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
     def __init__(
         self,
-        vae: FlaxAutoencoderKL,
-        text_encoder: FlaxCLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: FlaxUNet2DConditionModel,
-        controlnet: FlaxControlNetModel,
         scheduler: Union[
-            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
         ],
         safety_checker: FlaxStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
@@ -122,6 +140,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             unet=unet,
             controlnet=controlnet,
             scheduler=scheduler,
             safety_checker=safety_checker,
@@ -135,30 +154,50 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         else:
             eps = jax.random.normal(prng, x0.shape, dtype=text_embeddings.dtype)
             alpha_vec = jnp.prod(params["scheduler"].common.alphas[t0:tMax])
-            xt = jnp.sqrt(alpha_vec) * x0 + \
-                jnp.sqrt(1-alpha_vec) * eps
             return xt
-    def DDIM_backward(self, params, num_inference_steps, timesteps, skip_t, t0, t1, do_classifier_free_guidance, text_embeddings, latents_local,
-                        guidance_scale, controlnet_image=None, controlnet_conditioning_scale=None):
-        scheduler_state = self.scheduler.set_timesteps(params["scheduler"], num_inference_steps)
         f = latents_local.shape[2]
-        latents_local = rearrange(latents_local, "b c f w h -> (b f) c w h")
         latents = latents_local.copy()
         x_t0_1 = None
         x_t1_1 = None
-        max_timestep = len(timesteps)-1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, x_t0_1, x_t1_1, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
-            latent_model_input = jnp.concatenate(
-                [latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
-            te = jnp.stack([text_embeddings[0, :, :]]*f + [text_embeddings[-1,:,:]]*f)
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
@@ -185,41 +224,53 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
-                    ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
-                noise_pred = noise_pred_uncond + guidance_scale * \
-                    (noise_pred_text - noise_pred_uncond)
             # compute the previous noisy sample x_t -> x_t-1
-            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
-            x_t0_1 = jax.lax.select((step < max_timestep-1) & (timesteps[step+1] == t0), latents, x_t0_1)
-            x_t1_1 = jax.lax.select((step < max_timestep-1) & (timesteps[step+1] == t1), latents, x_t1_1)
             return (step + 1, latents, x_t0_1, x_t1_1, scheduler_state)
         latents_shape = latents.shape
         x_t0_1, x_t1_1 = jnp.zeros(latents_shape), jnp.zeros(latents_shape)
         def cond_fun(arg):
             step, latents, x_t0_1, x_t1_1, scheduler_state = arg
             return (step < skip_t) & (step < num_inference_steps)
         if DEBUG:
             step = 0
             while cond_fun((step, latents, x_t0_1, x_t1_1)):
-                step, latents, x_t0_1, x_t1_1, scheduler_state = while_body((step, latents, x_t0_1, x_t1_1, scheduler_state))
                 step = step + 1
         else:
-            _, latents, x_t0_1, x_t1_1, scheduler_state = jax.lax.while_loop(cond_fun, while_body, (0, latents, x_t0_1, x_t1_1, scheduler_state))
-        latents = rearrange(latents, "(b f) c w h -> b c f  w h", f=f)
         res = {"x0": latents.copy()}
         if x_t0_1 is not None:
-            x_t0_1 = rearrange(x_t0_1, "(b f) c w h -> b c f  w h", f=f)
             res["x_t0_1"] = x_t0_1.copy()
         if x_t1_1 is not None:
-            x_t1_1 = rearrange(x_t1_1, "(b f) c w h -> b c f  w h", f=f)
             res["x_t1_1"] = x_t1_1.copy()
         return res
     def warp_latents_independently(self, latents, reference_flow):
         _, _, H, W = reference_flow.shape
         b, _, f, h, w = latents.shape
@@ -230,10 +281,10 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
-        coords_t0 = rearrange(coords_t0, 'f c h w -> f h w c')
-        latents_0 = rearrange(latents[0], 'c f h w -> f  c  h w')
         warped = grid_sample(latents_0, coords_t0, "mirror")
-        warped = rearrange(warped, '(b f) c h w -> b c f h w', f=f)
         return warped
     def warp_vid_independently(self, vid, reference_flow):
@@ -245,74 +296,173 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
-        coords_t0 = rearrange(coords_t0, 'f c h w -> f h w c')
         # latents_0 = rearrange(vid, 'c f h w -> f  c  h w')
         warped = grid_sample(vid, coords_t0, "zeropad")
         # warped = rearrange(warped, 'f c h w -> b c f h w', f=f)
         return warped
-    def create_motion_field(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, latents):
-        reference_flow = jnp.zeros(
-            (video_length-1, 2, 512, 512), dtype=latents.dtype)
         for fr_idx, frame_id in enumerate(frame_ids):
-            reference_flow = reference_flow.at[fr_idx, 0, :,
-                           :].set(motion_field_strength_x*(frame_id))
-            reference_flow = reference_flow.at[fr_idx, 1, :,
-                           :].set(motion_field_strength_y*(frame_id))
         return reference_flow
-    def create_motion_field_and_warp_latents(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, latents):
-        motion_field = self.create_motion_field(motion_field_strength_x=motion_field_strength_x,
-                                                motion_field_strength_y=motion_field_strength_y, latents=latents, video_length=video_length, frame_ids=frame_ids)
         for idx, latent in enumerate(latents):
-            latents = latents.at[idx].set(self.warp_latents_independently(
-                latent[None], motion_field)[0])
         return motion_field, latents
-    def text_to_video_zero(self, params,
-                           prng,
-                           text_embeddings,
-                           video_length: Optional[int],
-                           do_classifier_free_guidance = True,
-                           height: Optional[int] = None,
-                           width: Optional[int] = None,
-                           num_inference_steps: int = 50,
-                           guidance_scale: float = 7.5,
-                           num_videos_per_prompt: Optional[int] = 1,
-                           xT = None,
-                           motion_field_strength_x: float = 12,
-                           motion_field_strength_y: float = 12,
-                           t0: int = 44,
-                           t1: int = 47,
-                           controlnet_image=None,
-                           controlnet_conditioning_scale=0,
-                           ):
         frame_ids = list(range(video_length))
         # Prepare timesteps
-        params["scheduler"] = self.scheduler.set_timesteps(params["scheduler"], num_inference_steps)
         timesteps = params["scheduler"].timesteps
         # Prepare latent variables
         num_channels_latents = self.unet.in_channels
         batch_size = 1
-        xT = prepare_latents(params, prng, batch_size * num_videos_per_prompt, num_channels_latents, 1, height, width, self.vae_scale_factor, xT)
-        xT = xT[:, :, :1]
-        timesteps_ddpm = [981, 961, 941, 921, 901, 881, 861, 841, 821, 801, 781, 761, 741, 721,
-                            701, 681, 661, 641, 621, 601, 581, 561, 541, 521, 501, 481, 461, 441,
-                            421, 401, 381, 361, 341, 321, 301, 281, 261, 241, 221, 201, 181, 161,
-                            141, 121, 101,  81,  61,  41,  21,   1]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         x_t1_1 = None
         # Denoising loop
-        shape = (batch_size, num_channels_latents, 1, height //
-                self.vae.scaling_factor, width // self.vae.scaling_factor)
         #  perform ∆t backward steps by stable diffusion
-        ddim_res = self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=1000, t0=t0, t1=t1, do_classifier_free_guidance=do_classifier_free_guidance,
-                                text_embeddings=text_embeddings, latents_local=xT, guidance_scale=guidance_scale,
-                                controlnet_image=jnp.stack([controlnet_image[0]] * 2), controlnet_conditioning_scale=controlnet_conditioning_scale)
         x0 = ddim_res["x0"]
         # apply warping functions
@@ -320,37 +470,524 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             x_t0_1 = ddim_res["x_t0_1"]
         if "x_t1_1" in ddim_res:
             x_t1_1 = ddim_res["x_t1_1"]
-        x_t0_k = x_t0_1[:, :, :1, :, :].repeat(video_length-1, 2)
         reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
-            motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_t0_k, video_length=video_length, frame_ids=frame_ids[1:])
         # assuming t0=t1=1000, if t0 = 1000
         # DDPM forward for more motion freedom
-        ddpm_fwd = partial(self.DDPM_forward, params=params, prng=prng, x0=x_t0_k, t0=t0,
-                           tMax=t1, shape=shape, text_embeddings=text_embeddings)
-        x_t1_k = jax.lax.cond(t1 > t0,
-                              ddpm_fwd,
-                              lambda:x_t0_k
         )
-        x_t1 = jnp.concatenate([x_t1_1, x_t1_k], axis=2).copy()
         # backward steps by stable diffusion
-        #warp the controlnet image following the same flow defined for latent
         controlnet_video = controlnet_image[:video_length]
-        controlnet_video = controlnet_video.at[1:].set(self.warp_vid_independently(controlnet_video[1:], reference_flow))
-        controlnet_image = jnp.concatenate([controlnet_video]*2)
-        ddim_res = self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=t1, t0=-1, t1=-1, do_classifier_free_guidance=do_classifier_free_guidance,
-                                            text_embeddings=text_embeddings, latents_local=x_t1, guidance_scale=guidance_scale,
-                                            controlnet_image=controlnet_image, controlnet_conditioning_scale=controlnet_conditioning_scale)
         x0 = ddim_res["x0"]
         return x0
     def prepare_text_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
@@ -359,27 +996,38 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             return_tensors="np",
         )
         return text_input.input_ids
     def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
         if not isinstance(image, (Image.Image, list)):
-            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
         if isinstance(image, Image.Image):
             image = [image]
-        processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
         return processed_images
     def _get_has_nsfw_concepts(self, features, params):
         has_nsfw_concepts = self.safety_checker(features, params)
         return has_nsfw_concepts
     def _run_safety_checker(self, images, safety_model_params, jit=False):
         # safety_model_params should already be replicated when jit is True
         pil_images = [Image.fromarray(image) for image in images]
         features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
         if jit:
             features = shard(features)
-            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
             has_nsfw_concepts = unshard(has_nsfw_concepts)
             safety_model_params = unreplicate(safety_model_params)
         else:
-            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
         images_was_copied = False
         for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
             if has_nsfw_concept:
@@ -393,6 +1041,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                     " instead. Try again with a different prompt and/or seed."
                 )
         return images, has_nsfw_concepts
     def _generate(
         self,
         prompt_ids: jnp.array,
@@ -404,7 +1053,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         latents: Optional[jnp.array] = None,
         neg_prompt_ids: Optional[jnp.array] = None,
         controlnet_conditioning_scale: float = 1.0,
-        xT = None,
         motion_field_strength_x: float = 12,
         motion_field_strength_y: float = 12,
         t0: int = 44,
@@ -413,7 +1063,9 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         height, width = image.shape[-2:]
         video_length = image.shape[0]
         if height % 64 != 0 or width % 64 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
         # get prompt text embeddings
         prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
@@ -422,30 +1074,47 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         max_length = prompt_ids.shape[-1]
         if neg_prompt_ids is None:
             uncond_input = self.tokenizer(
-                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
             ).input_ids
         else:
             uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
         context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
         image = jnp.concatenate([image] * 2)
         seed_t2vz, prng_seed = jax.random.split(prng_seed)
-        #get the latent following text to video zero
-        latents = self.text_to_video_zero(params, seed_t2vz, text_embeddings=context, video_length=video_length,
-                                          height=height, width = width, num_inference_steps=num_inference_steps,
-                                          guidance_scale=guidance_scale, controlnet_image=image,
-                                          xT=xT, t0=t0, t1=t1,
-                                          motion_field_strength_x=motion_field_strength_x,
-                                          motion_field_strength_y=motion_field_strength_y,
-                                          controlnet_conditioning_scale=controlnet_conditioning_scale
-                                          )
         # scale and decode the image latents with vae
         latents = 1 / self.vae.config.scaling_factor * latents
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
-        video = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
         video = (video / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
         return video
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
@@ -460,7 +1129,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
-        xT = None,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
@@ -517,7 +1187,9 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
-            controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
@@ -534,6 +1206,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                 neg_prompt_ids,
                 controlnet_conditioning_scale,
                 xT,
                 motion_field_strength_x,
                 motion_field_strength_y,
                 t0,
@@ -551,6 +1224,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                 neg_prompt_ids,
                 controlnet_conditioning_scale,
                 xT,
                 motion_field_strength_x,
                 motion_field_strength_y,
                 t0,
@@ -560,8 +1234,12 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
-            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
-            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
@@ -574,17 +1252,21 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
-        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
 # Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
 # Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
 @partial(
     jax.pmap,
-    in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0, 0, None, None, None, None),
-    static_broadcasted_argnums=(0, 5, 11, 12, 13, 14),
 )
 def _p_generate(
     pipe,
-    prompt_ids,
     image,
     params,
     prng_seed,
@@ -594,6 +1276,7 @@ def _p_generate(
     neg_prompt_ids,
     controlnet_conditioning_scale,
     xT,
     motion_field_strength_x,
     motion_field_strength_y,
     t0,
@@ -610,19 +1293,26 @@ def _p_generate(
         neg_prompt_ids,
         controlnet_conditioning_scale,
         xT,
         motion_field_strength_x,
         motion_field_strength_y,
         t0,
         t1,
     )
 @partial(jax.pmap, static_broadcasted_argnums=(0,))
 def _p_get_has_nsfw_concepts(pipe, features, params):
     # pmap-ed entry point: `pipe` (argument 0) is broadcast as a static
     # argument, `features` and `params` are mapped over their leading axis.
     # Simply forwards to the pipeline's own `_get_has_nsfw_concepts`.
     return pipe._get_has_nsfw_concepts(features, params)
 def unshard(x: jnp.ndarray):
     """Merge the two leading axes of `x` (devices, per-device batch) into one."""
     # same result as einops.rearrange(x, 'd b ... -> (d b) ...')
     devices, per_device = x.shape[:2]
     trailing = x.shape[2:]
     return x.reshape((devices * per_device, *trailing))
 def preprocess(image, dtype):
     image = image.convert("RGB")
     w, h = image.size
@@ -632,43 +1322,98 @@ def preprocess(image, dtype):
     image = image[None].transpose(0, 3, 1, 2)
     return image
-def prepare_latents(params, prng, batch_size, num_channels_latents, video_length, height, width, vae_scale_factor, latents=None):
-    shape = (batch_size, num_channels_latents, video_length, height //
-            vae_scale_factor, width // vae_scale_factor)
     # scale the initial noise by the standard deviation required by the scheduler
     if latents is None:
         latents = jax.random.normal(prng, shape)
     latents = latents * params["scheduler"].init_noise_sigma
     return latents
 def coords_grid(batch, ht, wd):
     """Return a (batch, 2, ht, wd) index grid: channel 0 is the column index, channel 1 the row index."""
     rows, cols = jnp.meshgrid(jnp.arange(ht), jnp.arange(wd), indexing="ij")
     # (x, y) ordering: column indices first, then row indices
     xy = jnp.stack([cols, rows], axis=0)
     return xy[None].repeat(batch, 0)
 def adapt_pos_mirror(x, y, W, H):
-  #adapt the position, with mirror padding
-  x_w_mirror = ((x + W - 1) % (2*(W - 1))) - W + 1
-  x_adapted = jnp.where(x_w_mirror > 0, x_w_mirror, - (x_w_mirror))
-  y_w_mirror = ((y + H - 1) % (2*(H - 1))) - H + 1
-  y_adapted = jnp.where(y_w_mirror > 0, y_w_mirror, - (y_w_mirror))
-  return y_adapted, x_adapted
-def safe_get_zeropad(img, x,y,W,H):
-  return jnp.where((x < W) & (x > 0) & (y < H) & (y > 0), img[y,x], 0.)
-def safe_get_mirror(img, x,y,W,H):
-  return img[adapt_pos_mirror(x,y,W,H)]
 @partial(jax.vmap, in_axes=(0, 0, None))
 @partial(jax.vmap, in_axes=(0, None, None))
-@partial(jax.vmap, in_axes=(None,0, None))
 @partial(jax.vmap, in_axes=(None, 0, None))
 def grid_sample(latents, grid, method):
     # this is an alternative to torch.functional.nn.grid_sample in jax
     # this implementation is following the algorithm described @ https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html
     # but with coordinates scaled to the size of the image
     if method == "mirror":
-      return safe_get_mirror(latents, jnp.array(grid[0], dtype=jnp.int16), jnp.array(grid[1], dtype=jnp.int16), latents.shape[0], latents.shape[1])
-    else: #default is zero padding
-      return safe_get_zeropad(latents, jnp.array(grid[0], dtype=jnp.int16), jnp.array(grid[1], dtype=jnp.int16), latents.shape[0], latents.shape[1])

 import numpy as np
 from flax.core.frozen_dict import FrozenDict
 from flax.jax_utils import unreplicate
+from flax import jax_utils
 from flax.training.common_utils import shard
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 from einops import rearrange, repeat
+from diffusers.models import (
+    FlaxAutoencoderKL,
+    FlaxControlNetModel,
+    FlaxUNet2DConditionModel,
+)
 from diffusers.schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
 from diffusers.utils import PIL_INTERPOLATION, logging, replace_example_docstring
 from diffusers.pipelines.pipeline_flax_utils import FlaxDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker_flax import (
+    FlaxStableDiffusionSafetyChecker,
+)
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 """
 Text2Video-Zero:
  - Inputs: Prompt, Pose Control via mp4/gif, First Frame (?)
  - JAX implementation
  - 3DUnet to replace 2DUnetConditional
 """
+def replicate_devices(array):
+    """Add a leading axis to `array` and repeat it once per available JAX device."""
+    with_device_axis = jnp.expand_dims(array, 0)
+    n_devices = jax.device_count()
+    return with_device_axis.repeat(n_devices, 0)
+DEBUG = False  # Set to True to use python for loop instead of jax.fori_loop for easier debugging
 EXAMPLE_DOC_STRING = """
     Examples:
         >>> output_images.save("generated_image.png")
         ```
 """
 class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
     def __init__(
         self,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        unet_vanilla,
+        controlnet,
         scheduler: Union[
+            FlaxDDIMScheduler,
+            FlaxPNDMScheduler,
+            FlaxLMSDiscreteScheduler,
+            FlaxDPMSolverMultistepScheduler,
         ],
         safety_checker: FlaxStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             unet=unet,
+            unet_vanilla=unet_vanilla,
             controlnet=controlnet,
             scheduler=scheduler,
             safety_checker=safety_checker,
         else:
             eps = jax.random.normal(prng, x0.shape, dtype=text_embeddings.dtype)
             alpha_vec = jnp.prod(params["scheduler"].common.alphas[t0:tMax])
+            xt = jnp.sqrt(alpha_vec) * x0 + jnp.sqrt(1 - alpha_vec) * eps
             return xt
+    def DDIM_backward(
+        self,
+        params,
+        num_inference_steps,
+        timesteps,
+        skip_t,
+        t0,
+        t1,
+        do_classifier_free_guidance,
+        text_embeddings,
+        latents_local,
+        guidance_scale,
+        controlnet_image=None,
+        controlnet_conditioning_scale=None,
+    ):
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps
+        )
         f = latents_local.shape[2]
+        latents_local = rearrange(latents_local, "b c f h w -> (b f) c h w")
         latents = latents_local.copy()
         x_t0_1 = None
         x_t1_1 = None
+        max_timestep = len(timesteps) - 1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, x_t0_1, x_t1_1, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            latent_model_input = (
+                jnp.concatenate([latents] * 2)
+                if do_classifier_free_guidance
+                else latents
+            )
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
+            te = jnp.stack(
+                [text_embeddings[0, :, :]] * f + [text_embeddings[-1, :, :]] * f
+            )
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
+                ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
             # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(
+                scheduler_state, noise_pred, t, latents
+            ).to_tuple()
+            x_t0_1 = jax.lax.select(
+                (step < max_timestep - 1) & (timesteps[step + 1] == t0), latents, x_t0_1
+            )
+            x_t1_1 = jax.lax.select(
+                (step < max_timestep - 1) & (timesteps[step + 1] == t1), latents, x_t1_1
+            )
             return (step + 1, latents, x_t0_1, x_t1_1, scheduler_state)
         latents_shape = latents.shape
         x_t0_1, x_t1_1 = jnp.zeros(latents_shape), jnp.zeros(latents_shape)
         def cond_fun(arg):
             step, latents, x_t0_1, x_t1_1, scheduler_state = arg
             return (step < skip_t) & (step < num_inference_steps)
         if DEBUG:
             step = 0
             while cond_fun((step, latents, x_t0_1, x_t1_1)):
+                step, latents, x_t0_1, x_t1_1, scheduler_state = while_body(
+                    (step, latents, x_t0_1, x_t1_1, scheduler_state)
+                )
                 step = step + 1
         else:
+            _, latents, x_t0_1, x_t1_1, scheduler_state = jax.lax.while_loop(
+                cond_fun, while_body, (0, latents, x_t0_1, x_t1_1, scheduler_state)
+            )
+        latents = rearrange(latents, "(b f) c h w -> b c f h w", f=f)
         res = {"x0": latents.copy()}
         if x_t0_1 is not None:
+            x_t0_1 = rearrange(x_t0_1, "(b f) c h w -> b c f  h w", f=f)
             res["x_t0_1"] = x_t0_1.copy()
         if x_t1_1 is not None:
+            x_t1_1 = rearrange(x_t1_1, "(b f) c h w -> b c f  h w", f=f)
             res["x_t1_1"] = x_t1_1.copy()
         return res
     def warp_latents_independently(self, latents, reference_flow):
         _, _, H, W = reference_flow.shape
         b, _, f, h, w = latents.shape
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
+        coords_t0 = rearrange(coords_t0, "f c h w -> f h w c")
+        latents_0 = rearrange(latents[0], "c f h w -> f  c  h w")
         warped = grid_sample(latents_0, coords_t0, "mirror")
+        warped = rearrange(warped, "(b f) c h w -> b c f h w", f=f)
         return warped
     def warp_vid_independently(self, vid, reference_flow):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
+        coords_t0 = rearrange(coords_t0, "f c h w -> f h w c")
         # latents_0 = rearrange(vid, 'c f h w -> f  c  h w')
         warped = grid_sample(vid, coords_t0, "zeropad")
         # warped = rearrange(warped, 'f c h w -> b c f h w', f=f)
         return warped
+    def create_motion_field(
+        self,
+        motion_field_strength_x,
+        motion_field_strength_y,
+        frame_ids,
+        video_length,
+        latents,
+    ):
+        # Builds a spatially constant flow field per frame.
+        # The result has shape (video_length - 1, 2, 512, 512) and the dtype of
+        # `latents`; note the 512x512 spatial size is hard-coded here.
+        # For the i-th entry of `frame_ids`, channel 0 is filled with
+        # motion_field_strength_x * frame_id and channel 1 with
+        # motion_field_strength_y * frame_id.
+        reference_flow = jnp.zeros((video_length - 1, 2, 512, 512), dtype=latents.dtype)
         for fr_idx, frame_id in enumerate(frame_ids):
+            reference_flow = reference_flow.at[fr_idx, 0, :, :].set(
+                motion_field_strength_x * (frame_id)
+            )
+            reference_flow = reference_flow.at[fr_idx, 1, :, :].set(
+                motion_field_strength_y * (frame_id)
+            )
         return reference_flow
+    def create_motion_field_and_warp_latents(
+        self,
+        motion_field_strength_x,
+        motion_field_strength_y,
+        frame_ids,
+        video_length,
+        latents,
+    ):
+        # Creates the motion field via `create_motion_field`, then warps every
+        # entry along the first axis of `latents` with that same field.
+        # Returns the tuple (motion_field, warped_latents).
+        motion_field = self.create_motion_field(
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+            latents=latents,
+            video_length=video_length,
+            frame_ids=frame_ids,
+        )
         for idx, latent in enumerate(latents):
+            # `latent[None]` restores the leading axis expected by the warp
+            # helper; `[0]` strips it again before writing the result back.
+            latents = latents.at[idx].set(
+                self.warp_latents_independently(latent[None], motion_field)[0]
+            )
         return motion_field, latents
+    def text_to_video_zero(
+        self,
+        params,
+        prng,
+        text_embeddings,
+        video_length: Optional[int],
+        do_classifier_free_guidance=True,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_videos_per_prompt: Optional[int] = 1,
+        xT=None,
+        smooth_bg_strength: float = 0.0,
+        motion_field_strength_x: float = 12,
+        motion_field_strength_y: float = 12,
+        t0: int = 44,
+        t1: int = 47,
+        controlnet_image=None,
+        controlnet_conditioning_scale=0,
+    ):
         frame_ids = list(range(video_length))
         # Prepare timesteps
+        params["scheduler"] = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps
+        )
         timesteps = params["scheduler"].timesteps
         # Prepare latent variables
         num_channels_latents = self.unet.in_channels
         batch_size = 1
+        xT = prepare_latents(
+            params,
+            prng,
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            self.vae_scale_factor,
+            xT,
+        )
+        timesteps_ddpm = [
+            981,
+            961,
+            941,
+            921,
+            901,
+            881,
+            861,
+            841,
+            821,
+            801,
+            781,
+            761,
+            741,
+            721,
+            701,
+            681,
+            661,
+            641,
+            621,
+            601,
+            581,
+            561,
+            541,
+            521,
+            501,
+            481,
+            461,
+            441,
+            421,
+            401,
+            381,
+            361,
+            341,
+            321,
+            301,
+            281,
+            261,
+            241,
+            221,
+            201,
+            181,
+            161,
+            141,
+            121,
+            101,
+            81,
+            61,
+            41,
+            21,
+            1,
+        ]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         x_t1_1 = None
         # Denoising loop
+        shape = (
+            batch_size,
+            num_channels_latents,
+            1,
+            height // self.vae.scaling_factor,
+            width // self.vae.scaling_factor,
+        )
         #  perform ∆t backward steps by stable diffusion
+        ddim_res = self.DDIM_backward(
+            params,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            skip_t=1000,
+            t0=t0,
+            t1=t1,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            text_embeddings=text_embeddings,
+            latents_local=xT,
+            guidance_scale=guidance_scale,
+            controlnet_image=jnp.stack([controlnet_image[0]] * 2),
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+        )
         x0 = ddim_res["x0"]
         # apply warping functions
             x_t0_1 = ddim_res["x_t0_1"]
         if "x_t1_1" in ddim_res:
             x_t1_1 = ddim_res["x_t1_1"]
+        x_t0_k = x_t0_1[:, :, :1, :, :].repeat(video_length - 1, 2)
         reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+            latents=x_t0_k,
+            video_length=video_length,
+            frame_ids=frame_ids[1:],
+        )
         # assuming t0=t1=1000, if t0 = 1000
         # DDPM forward for more motion freedom
+        ddpm_fwd = partial(
+            self.DDPM_forward,
+            params=params,
+            prng=prng,
+            x0=x_t0_k,
+            t0=t0,
+            tMax=t1,
+            shape=shape,
+            text_embeddings=text_embeddings,
         )
+        x_t1_k = jax.lax.cond(t1 > t0, ddpm_fwd, lambda: x_t0_k)
+        x_t1 = jnp.concatenate([x_t1_1, x_t1_k], axis=2)
         # backward steps by stable diffusion
+        # warp the controlnet image following the same flow defined for latent
         controlnet_video = controlnet_image[:video_length]
+        controlnet_video = controlnet_video.at[1:].set(
+            self.warp_vid_independently(controlnet_video[1:], reference_flow)
+        )
+        controlnet_image = jnp.concatenate([controlnet_video] * 2)
+        smooth_bg = True
+        if smooth_bg:
+            # latent shape: "b c f h w"
+            M_FG = repeat(
+                get_mask_pose(controlnet_video),
+                "f h w -> b c f h w",
+                c=x_t1.shape[1],
+                b=batch_size,
+            )
+            initial_bg = repeat(
+                x_t1[:, :, 0] * (1 - M_FG[:, :, 0]),
+                "b c h w -> b c f h w",
+                f=video_length - 1,
+            )
+            # warp the controlnet image following the same flow defined for latent #f c h w
+            initial_bg_warped = self.warp_latents_independently(
+                initial_bg, reference_flow
+            )
+            bgs = x_t1[:, :, 1:] * (1 - M_FG[:, :, 1:])  # initial background
+            initial_mask_warped = 1 - self.warp_latents_independently(
+                repeat(M_FG[:, :, 0], "b c h w -> b c f h w", f=video_length - 1),
+                reference_flow,
+            )
+            # initial_mask_warped = 1 - warp_vid_independently(repeat(M_FG[:,:,0], "b c h w -> (b f) c h w", f = video_length-1), reference_flow)
+            # initial_mask_warped = rearrange(initial_mask_warped, "(b f) c h w -> b c f h w", b=batch_size)
+            mask = (1 - M_FG[:, :, 1:]) * initial_mask_warped
+            x_t1 = x_t1.at[:, :, 1:].set(
+                (1 - mask) * x_t1[:, :, 1:]
+                + mask
+                * (
+                    initial_bg_warped * smooth_bg_strength
+                    + (1 - smooth_bg_strength) * bgs
+                )
+            )
+        ddim_res = self.DDIM_backward(
+            params,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            skip_t=t1,
+            t0=-1,
+            t1=-1,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            text_embeddings=text_embeddings,
+            latents_local=x_t1,
+            guidance_scale=guidance_scale,
+            controlnet_image=controlnet_image,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+        )
         x0 = ddim_res["x0"]
+        del ddim_res
+        del x_t1
+        del x_t1_1
+        del x_t1_k
         return x0
+    def denoise_latent(
+        self,
+        params,
+        num_inference_steps,
+        timesteps,
+        do_classifier_free_guidance,
+        text_embeddings,
+        latents,
+        guidance_scale,
+        controlnet_image=None,
+        controlnet_conditioning_scale=None,
+    ):
+        """Run `num_inference_steps` scheduler steps on `latents` using `unet_vanilla`.
+        Args:
+            params: parameter dict; the "scheduler" and "unet" entries are read,
+                plus "controlnet" when `controlnet_image` is given.
+            num_inference_steps: number of denoising steps to run.
+            timesteps: not read by this method (the timesteps come from the
+                scheduler state); kept so existing callers keep working.
+            do_classifier_free_guidance: when true, the latents are duplicated
+                for the model call and the two predictions are combined with
+                `guidance_scale`.
+            text_embeddings: row 0 is paired with the first copy of the
+                latents and the last row with the second copy.
+            latents: latents to denoise; the first axis is the batch.
+            guidance_scale: weight of the guided prediction.
+            controlnet_image: optional conditioning passed to the controlnet.
+            controlnet_conditioning_scale: scale passed to the controlnet.
+        Returns:
+            The latents after the last scheduler step.
+        """
+        scheduler_state = self.scheduler.set_timesteps(
+            params["scheduler"], num_inference_steps
+        )
+        def while_body(args):
+            step, latents, scheduler_state = args
+            t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            latent_model_input = (
+                jnp.concatenate([latents] * 2)
+                if do_classifier_free_guidance
+                else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(
+                scheduler_state, latent_model_input, timestep=t
+            )
+            f = latents.shape[0]
+            # NOTE(review): `te` always has 2 * f rows, which matches the model
+            # input only when classifier-free guidance is enabled - confirm
+            # whether the unguided path is ever used.
+            te = jnp.stack(
+                [text_embeddings[0, :, :]] * f + [text_embeddings[-1, :, :]] * f
+            )
+            timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
+            # Controlnet residuals are only passed to the UNet when a
+            # conditioning image was supplied.
+            residuals = {}
+            if controlnet_image is not None:
+                down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
+                    {"params": params["controlnet"]},
+                    jnp.array(latent_model_input),
+                    jnp.array(timestep, dtype=jnp.int32),
+                    encoder_hidden_states=te,
+                    controlnet_cond=controlnet_image,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    return_dict=False,
+                )
+                residuals = {
+                    "down_block_additional_residuals": down_block_res_samples,
+                    "mid_block_additional_residual": mid_block_res_sample,
+                }
+            # predict the noise residual
+            noise_pred = self.unet_vanilla.apply(
+                {"params": params["unet"]},
+                jnp.array(latent_model_input),
+                jnp.array(timestep, dtype=jnp.int32),
+                encoder_hidden_states=te,
+                **residuals,
+            ).sample
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_text - noise_pred_uncond
+                )
+            # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(
+                scheduler_state, noise_pred, t, latents
+            ).to_tuple()
+            return (step + 1, latents, scheduler_state)
+        def cond_fun(arg):
+            step, latents, scheduler_state = arg
+            return step < num_inference_steps
+        if DEBUG:
+            # Plain Python loop for debugging. `while_body` already returns the
+            # advanced step counter, so it must not be incremented again here
+            # (doing so would skip every other denoising step).
+            step = 0
+            while cond_fun((step, latents, scheduler_state)):
+                step, latents, scheduler_state = while_body(
+                    (step, latents, scheduler_state)
+                )
+        else:
+            _, latents, scheduler_state = jax.lax.while_loop(
+                cond_fun, while_body, (0, latents, scheduler_state)
+            )
+        return latents
+    @partial(jax.jit, static_argnums=(0, 1))
+    def _generate_starting_frames(
+        self,
+        num_inference_steps,
+        params,
+        timesteps,
+        text_embeddings,
+        latents,
+        guidance_scale,
+        controlnet_image,
+        controlnet_conditioning_scale,
+    ):
+        """Denoise `latents` with classifier-free guidance and decode them with the VAE.
+        Jitted with `self` and `num_inference_steps` as static arguments.
+        Returns the decoded images clipped to [0, 1], with axes permuted by
+        (0, 2, 3, 1).
+        """
+        denoised = self.denoise_latent(
+            params,
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            do_classifier_free_guidance=True,
+            text_embeddings=text_embeddings,
+            latents=latents,
+            guidance_scale=guidance_scale,
+            controlnet_image=controlnet_image,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+        )
+        # scale the image latents before decoding them with the vae
+        vae_input = 1 / self.vae.config.scaling_factor * denoised
+        decoded = self.vae.apply(
+            {"params": params["vae"]}, vae_input, method=self.vae.decode
+        ).sample
+        rescaled = decoded / 2 + 0.5
+        return rescaled.clip(0, 1).transpose(0, 2, 3, 1)
+    def generate_starting_frames(
+        self,
+        params,
+        prngs: list,  # list of prngs for each img
+        prompt,
+        neg_prompt,
+        controlnet_image,
+        do_classifier_free_guidance=True,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        t0: int = 44,
+        t1: int = 47,
+        controlnet_conditioning_scale=1.0,
+    ):
+        """Generate one candidate first frame per PRNG key.
+        Args:
+            params: parameter dict; the "scheduler" and "text_encoder" entries
+                are read here and the dict is forwarded to the jitted helper.
+            prngs: PRNG keys, one per frame to generate; must not be empty.
+            prompt: text prompt, tokenized with `prepare_text_inputs`.
+            neg_prompt: negative prompt, or None to use an empty string.
+            controlnet_image: conditioning images; only entry 0 is used, and
+                its last two axes give the output height and width.
+            do_classifier_free_guidance: currently ignored, guidance is always
+                applied (see the TODO below).
+            num_inference_steps: number of denoising steps.
+            guidance_scale: classifier-free guidance weight.
+            t0, t1: accepted for interface compatibility; not used when
+                generating starting frames.
+            controlnet_conditioning_scale: scale forwarded to the controlnet.
+        Returns:
+            The decoded images produced by `_generate_starting_frames`.
+        Raises:
+            ValueError: if height or width is not divisible by 64, or if
+                `prngs` is empty.
+        """
+        height, width = controlnet_image.shape[-2:]
+        if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 64 but are {height} and {width}."
+            )
+        if len(prngs) == 0:
+            # fail early with a clear message instead of an opaque stack error
+            raise ValueError("`prngs` must contain at least one PRNG key.")
+        shape = (
+            self.unet.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )  # c h w, one sample per prng
+        print(
+            f"Generating {len(prngs)} first frames with prompt {prompt}, for {num_inference_steps} steps. PRNG seeds are: {prngs}"
+        )
+        latents = jnp.stack(
+            [jax.random.normal(prng, shape) for prng in prngs]
+        )  # b c h w
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * params["scheduler"].init_noise_sigma
+        timesteps = params["scheduler"].timesteps
+        # get prompt text embeddings
+        prompt_ids = self.prepare_text_inputs(prompt)
+        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        if neg_prompt is None:
+            uncond_input = self.tokenizer(
+                [""],
+                padding="max_length",
+                max_length=prompt_ids.shape[-1],
+                return_tensors="np",
+            ).input_ids
+        else:
+            uncond_input = self.prepare_text_inputs(neg_prompt)
+        negative_prompt_embeds = self.text_encoder(
+            uncond_input, params=params["text_encoder"]
+        )[0]
+        # negative embeddings first, prompt embeddings last
+        text_embeddings = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+        # the first control image is repeated twice per generated frame
+        controlnet_image = jnp.stack([controlnet_image[0]] * 2 * len(prngs))
+        return self._generate_starting_frames(
+            num_inference_steps,
+            params,
+            timesteps,
+            text_embeddings,
+            latents,
+            guidance_scale,
+            controlnet_image,
+            controlnet_conditioning_scale,
+        )
+    def generate_video(
+        self,
+        prompt: str,
+        image: jnp.array,
+        params: Union[Dict, FrozenDict],
+        prng_seed: jax.random.KeyArray,
+        num_inference_steps: int = 50,
+        guidance_scale: Union[float, jnp.array] = 7.5,
+        latents: jnp.array = None,
+        neg_prompt: str = "",
+        controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
+        return_dict: bool = True,
+        jit: bool = False,
+        xT=None,
+        smooth_bg_strength: float = 0.0,
+        motion_field_strength_x: float = 3,
+        motion_field_strength_y: float = 4,
+        t0: int = 44,
+        t1: int = 47,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt_ids (`jnp.array`):
+                The prompt or prompts to guide the image generation.
+            image (`jnp.array`):
+                Array representing the ControlNet input condition. ControlNet use this input condition to generate
+                guidance to Unet.
+            params (`Dict` or `FrozenDict`): Dictionary containing the model parameters/weights
+            prng_seed (`jax.random.KeyArray` or `jax.Array`): Array containing random number generator key
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            latents (`jnp.array`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            controlnet_conditioning_scale (`float` or `jnp.array`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+                a plain tuple.
+            jit (`bool`, defaults to `False`):
+                Whether to run `pmap` versions of the generation and safety scoring functions. NOTE: This argument
+                exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a future release.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+            element is a list of `bool`s denoting whether the corresponding generated image likely represents
+            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+        """
+        height, width = image.shape[-2:]
+        vid_length = image.shape[0]
+        # get prompt text embeddings
+        prompt_ids = self.prepare_text_inputs([prompt] * vid_length)
+        neg_prompt_ids = self.prepare_text_inputs([neg_prompt] * vid_length)
+        # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
+        # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
+        batch_size = 1
+        if isinstance(guidance_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0])
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                guidance_scale = guidance_scale[:, None]
+        if isinstance(controlnet_conditioning_scale, float):
+            # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
+            # shape information, as they may be sharded (when `jit` is `True`), or not.
+            controlnet_conditioning_scale = jnp.array(
+                [controlnet_conditioning_scale] * prompt_ids.shape[0]
+            )
+            if len(prompt_ids.shape) > 2:
+                # Assume sharded
+                controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
+        if jit:
+            images = _p_generate(
+                self,
+                replicate_devices(prompt_ids),
+                replicate_devices(image),
+                jax_utils.replicate(params),
+                replicate_devices(prng_seed),
+                num_inference_steps,
+                replicate_devices(guidance_scale),
+                replicate_devices(latents) if latents is not None else None,
+                replicate_devices(neg_prompt_ids)
+                if neg_prompt_ids is not None
+                else None,
+                replicate_devices(controlnet_conditioning_scale),
+                replicate_devices(xT) if xT is not None else None,
+                replicate_devices(smooth_bg_strength),
+                replicate_devices(motion_field_strength_x),
+                replicate_devices(motion_field_strength_y),
+                t0,
+                t1,
+            )
+        else:
+            images = self._generate(
+                prompt_ids,
+                image,
+                params,
+                prng_seed,
+                num_inference_steps,
+                guidance_scale,
+                latents,
+                neg_prompt_ids,
+                controlnet_conditioning_scale,
+                xT,
+                smooth_bg_strength,
+                motion_field_strength_x,
+                motion_field_strength_y,
+                t0,
+                t1,
+            )
+        if self.safety_checker is not None:
+            safety_params = params["safety_checker"]
+            images_uint8_casted = (images * 255).round().astype("uint8")
+            num_devices, batch_size = images.shape[:2]
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(
+                num_devices * batch_size, height, width, 3
+            )
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(
+                images_uint8_casted, safety_params, jit
+            )
+            images = np.asarray(images)
+            # block images
+            if any(has_nsfw_concept):
+                for i, is_nsfw in enumerate(has_nsfw_concept):
+                    if is_nsfw:
+                        images[i] = np.asarray(images_uint8_casted[i])
+            images = images.reshape(num_devices, batch_size, height, width, 3)
+        else:
+            images = np.asarray(images)
+            has_nsfw_concept = False
+        if not return_dict:
+            return (images, has_nsfw_concept)
+        return FlaxStableDiffusionPipelineOutput(
+            images=images, nsfw_content_detected=has_nsfw_concept
+        )
     def prepare_text_inputs(self, prompt: Union[str, List[str]]):
+        """Tokenize `prompt` with `self.tokenizer` and return the token ids.
+        Args:
+            prompt: A single prompt string or a list of prompt strings.
+        Returns:
+            The `input_ids` of the tokenizer output, requested as NumPy arrays
+            (`return_tensors="np"`) and padded with `padding="max_length"`.
+        Raises:
+            ValueError: If `prompt` is neither a `str` nor a `list`.
+        """
         if not isinstance(prompt, (str, list)):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+        # NOTE(review): no `max_length`/`truncation` argument is visible here -- confirm
+        # against the full file that over-long prompts are handled as intended.
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
             return_tensors="np",
         )
         return text_input.input_ids
     def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
+        """Preprocess one or more PIL images and concatenate the results.
+        Args:
+            image: A single `PIL.Image.Image` or a list of them.
+        Returns:
+            The `jnp.concatenate` (default axis 0) of `preprocess(img, jnp.float32)`
+            applied to every image.
+        Raises:
+            ValueError: If `image` is neither a `PIL.Image.Image` nor a `list`.
+        """
         if not isinstance(image, (Image.Image, list)):
+            raise ValueError(
+                f"image has to be of type `PIL.Image.Image` or list but is {type(image)}"
+            )
+        # Normalize the single-image case so the code below always iterates a list.
         if isinstance(image, Image.Image):
             image = [image]
+        processed_images = jnp.concatenate(
+            [preprocess(img, jnp.float32) for img in image]
+        )
         return processed_images
     def _get_has_nsfw_concepts(self, features, params):
         """Call `self.safety_checker(features, params)` and return its result unchanged."""
         has_nsfw_concepts = self.safety_checker(features, params)
         return has_nsfw_concepts
     def _run_safety_checker(self, images, safety_model_params, jit=False):
         # safety_model_params should already be replicated when jit is True
         pil_images = [Image.fromarray(image) for image in images]
         features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
         if jit:
             features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(
+                self, features, safety_model_params
+            )
             has_nsfw_concepts = unshard(has_nsfw_concepts)
             safety_model_params = unreplicate(safety_model_params)
         else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(
+                features, safety_model_params
+            )
         images_was_copied = False
         for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
             if has_nsfw_concept:
                     " instead. Try again with a different prompt and/or seed."
                 )
         return images, has_nsfw_concepts
     def _generate(
         self,
         prompt_ids: jnp.array,
         latents: Optional[jnp.array] = None,
         neg_prompt_ids: Optional[jnp.array] = None,
         controlnet_conditioning_scale: float = 1.0,
+        xT=None,
+        smooth_bg_strength: float = 0.0,
         motion_field_strength_x: float = 12,
         motion_field_strength_y: float = 12,
         t0: int = 44,
         height, width = image.shape[-2:]
         video_length = image.shape[0]
         if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 64 but are {height} and {width}."
+            )
         # get prompt text embeddings
         prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
         max_length = prompt_ids.shape[-1]
         if neg_prompt_ids is None:
             uncond_input = self.tokenizer(
+                [""] * batch_size,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="np",
             ).input_ids
         else:
             uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(
+            uncond_input, params=params["text_encoder"]
+        )[0]
         context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
         image = jnp.concatenate([image] * 2)
         seed_t2vz, prng_seed = jax.random.split(prng_seed)
+        # get the latent following text to video zero
+        latents = self.text_to_video_zero(
+            params,
+            seed_t2vz,
+            text_embeddings=context,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            controlnet_image=image,
+            xT=xT,
+            smooth_bg_strength=smooth_bg_strength,
+            t0=t0,
+            t1=t1,
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+        )
         # scale and decode the image latents with vae
         latents = 1 / self.vae.config.scaling_factor * latents
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
+        video = self.vae.apply(
+            {"params": params["vae"]}, latents, method=self.vae.decode
+        ).sample
         video = (video / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
         return video
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
+        xT=None,
+        smooth_bg_strength: float = 0.0,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
+            controlnet_conditioning_scale = jnp.array(
+                [controlnet_conditioning_scale] * prompt_ids.shape[0]
+            )
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
                 neg_prompt_ids,
                 controlnet_conditioning_scale,
                 xT,
+                smooth_bg_strength,
                 motion_field_strength_x,
                 motion_field_strength_y,
                 t0,
                 neg_prompt_ids,
                 controlnet_conditioning_scale,
                 xT,
+                smooth_bg_strength,
                 motion_field_strength_x,
                 motion_field_strength_y,
                 t0,
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(
+                num_devices * batch_size, height, width, 3
+            )
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(
+                images_uint8_casted, safety_params, jit
+            )
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
+        return FlaxStableDiffusionPipelineOutput(
+            images=images, nsfw_content_detected=has_nsfw_concept
+        )
 # Static argnums are pipe (0), num_inference_steps (5), t0 (14) and t1 (15). A change would trigger recompilation.
 # Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`).
 @partial(
     jax.pmap,
+    in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0, 0, 0, 0, 0, None, None),
+    static_broadcasted_argnums=(0, 5, 14, 15),
 )
 def _p_generate(
     pipe,
+    prompt_ids,
     image,
     params,
     prng_seed,
     neg_prompt_ids,
     controlnet_conditioning_scale,
     xT,
+    smooth_bg_strength,
     motion_field_strength_x,
     motion_field_strength_y,
     t0,
         neg_prompt_ids,
         controlnet_conditioning_scale,
         xT,
+        smooth_bg_strength,
         motion_field_strength_x,
         motion_field_strength_y,
         t0,
         t1,
     )
 @partial(jax.pmap, static_broadcasted_argnums=(0,))
 def _p_get_has_nsfw_concepts(pipe, features, params):
     """pmap-ed wrapper around `pipe._get_has_nsfw_concepts`.
     `pipe` (argument 0) is a static broadcast argument; `features` and `params`
     are mapped by `jax.pmap` over their leading axis (the pmap default).
     """
     return pipe._get_has_nsfw_concepts(features, params)
 def unshard(x: jnp.ndarray):
     """Merge the two leading axes of `x` into a single axis.
     Same effect as ``einops.rearrange(x, 'd b ... -> (d b) ...')``: an input of
     shape ``(d, b, *rest)`` is reshaped to ``(d * b, *rest)``.
     """
     devices, per_device, *trailing = x.shape
     return x.reshape(devices * per_device, *trailing)
 def preprocess(image, dtype):
     image = image.convert("RGB")
     w, h = image.size
     image = image[None].transpose(0, 3, 1, 2)
     return image
+def prepare_latents(
+    params,
+    prng,
+    batch_size,
+    num_channels_latents,
+    height,
+    width,
+    vae_scale_factor,
+    latents=None,
+):
+    """Return initial latents scaled by the scheduler's `init_noise_sigma`.
+    Args:
+        params: Mapping whose `"scheduler"` entry exposes `init_noise_sigma`.
+        prng: JAX PRNG key, used only when `latents` is None.
+        batch_size, num_channels_latents: Leading dimensions of the sampled noise.
+        height, width: Pixel dimensions; each is floor-divided by `vae_scale_factor`.
+        vae_scale_factor: Spatial downscaling factor between pixels and latents.
+        latents: Optional pre-made latents; when given they are only rescaled.
+    Returns:
+        `latents * params["scheduler"].init_noise_sigma`; freshly sampled noise has
+        shape (batch_size, num_channels_latents, 1, height // f, width // f).
+    """
+    # Layout is b c f h w with a single frame.
+    latent_h = height // vae_scale_factor
+    latent_w = width // vae_scale_factor
+    noise_shape = (batch_size, num_channels_latents, 1, latent_h, latent_w)
+    if latents is None:
+        latents = jax.random.normal(prng, noise_shape)
+    # scale the initial noise by the standard deviation required by the scheduler
+    sigma = params["scheduler"].init_noise_sigma
+    return latents * sigma
 def coords_grid(batch, ht, wd):
     """Build a (batch, 2, ht, wd) grid of integer pixel coordinates.
     Channel 0 holds the column index (x) and channel 1 the row index (y); the
     same grid is repeated `batch` times along the leading axis.
     """
     rows, cols = jnp.meshgrid(jnp.arange(ht), jnp.arange(wd), indexing="ij")
     xy = jnp.stack([cols, rows], axis=0)
     return jnp.repeat(xy[None], batch, axis=0)
 def adapt_pos_mirror(x, y, W, H):
+    """Reflect coordinates back into range (mirror padding).
+    Each coordinate is wrapped with period ``2 * (size - 1)`` and the negative half
+    is mirrored, so the result lies in ``[0, size - 1]``.
+    Returns:
+        `(y_adapted, x_adapted)` -- row first, ready to be used as an `img[y, x]` index.
+    """
+    def _fold(pos, size):
+        # Shift, wrap into one period, shift back to a signed offset, then mirror.
+        signed = ((pos + size - 1) % (2 * (size - 1))) - size + 1
+        return jnp.where(signed > 0, signed, -signed)
+    x_adapted = _fold(x, W)
+    y_adapted = _fold(y, H)
+    return y_adapted, x_adapted
+def safe_get_zeropad(img, x, y, W, H):
+    """Read `img[y, x]`, returning 0.0 for positions outside the image (zero padding).
+    Args:
+        img: Array indexed as `img[y, x]` (row, column).
+        x, y: Integer column / row positions.
+        W, H: Exclusive upper bounds for `x` and `y` respectively.
+    Returns:
+        `img[y, x]` where ``0 <= x < W`` and ``0 <= y < H``, otherwise 0.0.
+    """
+    # Index 0 is a valid row/column, so the lower bound must be inclusive; a strict
+    # `> 0` test would zero out the first row and column of every sampled image.
+    in_bounds = (x >= 0) & (x < W) & (y >= 0) & (y < H)
+    return jnp.where(in_bounds, img[y, x], 0.0)
+def safe_get_mirror(img, x, y, W, H):
+    """Read `img` at position (x, y) after reflecting it into range via `adapt_pos_mirror`."""
+    row, col = adapt_pos_mirror(x, y, W, H)
+    return img[row, col]
 @partial(jax.vmap, in_axes=(0, 0, None))
 @partial(jax.vmap, in_axes=(0, None, None))
+@partial(jax.vmap, in_axes=(None, 0, None))
 @partial(jax.vmap, in_axes=(None, 0, None))
 def grid_sample(latents, grid, method):
+    """Sample `latents` at the integer-truncated positions given by `grid`.
+    The four stacked `jax.vmap`s map the two leading axes of `latents` and the
+    three leading axes of `grid`; this body then sees one slice of `latents`
+    and one coordinate pair with `grid[0]` = x and `grid[1]` = y.
+    NOTE(review): assumes un-mapped inputs of shape latents (b, c, h, w) and
+    grid (b, h, w, 2) -- confirm against the caller.
+    Args:
+        latents: Values to sample from.
+        grid: Sampling positions in pixel units (not normalized to [-1, 1]).
+        method: `"mirror"` for reflection padding; anything else uses zero padding.
+    """
+    # this is an alternative to torch.nn.functional.grid_sample in jax
     # this implementation is following the algorithm described @ https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html
     # but with coordinates scaled to the size of the image
+    x = jnp.array(grid[0], dtype=jnp.int16)
+    y = jnp.array(grid[1], dtype=jnp.int16)
+    # The helpers index `img[y, x]`, so H must bound axis 0 and W axis 1. Passing
+    # them the other way round only works for square inputs.
+    H, W = latents.shape[0], latents.shape[1]
     if method == "mirror":
+        return safe_get_mirror(latents, x, y, W, H)
+    # default is zero padding
+    return safe_get_zeropad(latents, x, y, W, H)
+def bandw_vid(vid, threshold):
+    """Binarize `vid`: take the max over axis 1, then 1 where it exceeds `threshold`, else 0."""
+    peak = jnp.max(vid, axis=1)
+    above = peak > threshold
+    return jnp.where(above, 1, 0)
+def mean_blur(vid, k):
+    """Blur every frame of `vid` with a k x k box (mean) filter.
+    One uniform kernel per frame is built and the frames are convolved with
+    `mode="same"` under `jax.vmap`, so the output keeps the shape of `vid`.
+    """
+    kernels = jnp.ones((vid.shape[0], k, k)) / (k * k)
+    def _blur_frame(frame, kernel):
+        return jax.scipy.signal.convolve(frame, kernel, mode="same")
+    return jax.vmap(_blur_frame)(vid, kernels)
+def get_mask_pose(vid):
+    """Derive a low-resolution mask from `vid`.
+    Steps: binarize at 0.4 (max over axis 1), nearest-neighbour resize to 1/8 of
+    the spatial size, 7x7 mean blur, re-binarize at 0.01, then divide by
+    ``max + 1e-4``.
+    """
+    binary = bandw_vid(vid, 0.4)
+    n_frames, rows, cols = binary.shape
+    small = jax.image.resize(binary, (n_frames, rows // 8, cols // 8), "nearest")
+    # Insert a singleton axis 1 so `bandw_vid`'s max over axis 1 leaves values unchanged.
+    blurred = mean_blur(small, 7)[:, None]
+    mask = bandw_vid(blurred, threshold=0.01)
+    return mask / (jnp.max(mask) + 1e-4)

webui/app_control_animation.py CHANGED Viewed

@@ -19,112 +19,46 @@ examples = [
 ]
-images = []  # str path of generated images
-initial_frame = None
-animation_model = None
-def generate_initial_frames(
-    frames_prompt,
-    model_link,
-    is_safetensor,
-    frames_n_prompt,
-    width,
-    height,
-    cfg_scale,
-    seed,
-):
-    global images
-    if not model_link:
-        model_link = "dreamlike-art/dreamlike-photoreal-2.0"
-    images = animation_model.generate_initial_frames(
-        frames_prompt,
-        model_link,
-        is_safetensor,
-        frames_n_prompt,
-        width,
-        height,
-        cfg_scale,
-        seed,
-    )
-    return images
-def select_initial_frame(evt: gr.SelectData):
-    global initial_frame
-    if evt.index < len(images):
-        initial_frame = images[evt.index]
-        print(initial_frame)
 def create_demo(model: ControlAnimationModel):
-    global animation_model
-    animation_model = model
     with gr.Blocks() as demo:
-        with gr.Column(visible=True) as frame_selection_col:
             with gr.Row():
                 with gr.Column():
-                    frames_prompt = gr.Textbox(
-                        placeholder="Prompt", show_label=False, lines=4
                     )
-                    frames_n_prompt = gr.Textbox(
                         placeholder="Negative Prompt (optional)",
                         show_label=False,
                         lines=2,
                     )
-                with gr.Column():
-                    model_link = gr.Textbox(
-                        label="Model Link",
-                        placeholder="dreamlike-art/dreamlike-photoreal-2.0",
-                        info="Give the hugging face model name or URL link to safetensor.",
-                    )
-                    is_safetensor = gr.Checkbox(label="Safetensors")
                     gen_frames_button = gr.Button(
                         value="Generate Initial Frames", variant="primary"
                     )
-            with gr.Row():
-                with gr.Column(scale=2):
-                    width = gr.Slider(32, 2048, value=512, label="Width")
-                    height = gr.Slider(32, 2048, value=512, label="Height")
-                    cfg_scale = gr.Slider(1, 20, value=7.0, step=0.1, label="CFG scale")
-                    seed = gr.Slider(
-                        label="Seed",
-                        info="-1 for random seed on each run. Otherwise, the seed will be fixed.",
-                        minimum=-1,
-                        maximum=65536,
-                        value=0,
-                        step=1,
-                    )
-                with gr.Column(scale=3):
-                    initial_frames = gr.Gallery(
-                        label="Initial Frames", show_label=False
-                    ).style(columns=4, object_fit="contain")
-                    initial_frames.select(select_initial_frame)
-                    select_frame_button = gr.Button(
-                        value="Select Initial Frame", variant="secondary"
-                    )
-        with gr.Column(visible=False) as gen_animation_col:
-            with gr.Row():
-                with gr.Column():
-                    prompt = gr.Textbox(label="Prompt")
-                    gen_animation_button = gr.Button(
-                        value="Generate Animation", variant="primary"
-                    )
                     with gr.Accordion("Advanced options", open=False):
-                        n_prompt = gr.Textbox(
-                            label="Negative Prompt (optional)", value=""
-                        )
                         if on_huggingspace:
                             video_length = gr.Slider(
                                 label="Video length", minimum=8, maximum=16, step=1
@@ -197,68 +131,101 @@ def create_demo(model: ControlAnimationModel):
                         )
                 with gr.Column():
                     result = gr.Video(label="Generated Video")
-        inputs = [
             prompt,
             model_link,
-            is_safetensor,
             motion_field_strength_x,
             motion_field_strength_y,
             t0,
             t1,
-            n_prompt,
             chunk_size,
             video_length,
             merging_ratio,
             seed,
         ]
-        # gr.Examples(examples=examples,
-        #             inputs=inputs,
-        #             outputs=result,
-        #             fn=None,
-        #             run_on_click=False,
-        #             cache_examples=on_huggingspace,
-        # )
-        frame_inputs = [
-            frames_prompt,
-            model_link,
-            is_safetensor,
-            frames_n_prompt,
-            width,
-            height,
-            cfg_scale,
-            seed,
-        ]
-        def submit_select():
-            show = True
-            if initial_frame is not None:  # More to next step
                 return {
-                    frame_selection_col: gr.update(visible=not show),
-                    gen_animation_col: gr.update(visible=show),
                 }
             return {
-                frame_selection_col: gr.update(visible=show),
-                gen_animation_col: gr.update(visible=not show),
             }
         gen_frames_button.click(
-            generate_initial_frames,
             inputs=frame_inputs,
             outputs=initial_frames,
         )
-        select_frame_button.click(
-            submit_select, inputs=None, outputs=[frame_selection_col, gen_animation_col]
-        )
         gen_animation_button.click(
-            fn=model.process_text2video,
-            inputs=inputs,
             outputs=result,
         )
     return demo

 ]
+def on_video_path_update(evt: gr.EventData):
+    """Return a Markdown string showing the event's raw payload in bold.
+    NOTE(review): reads the private `evt._data` attribute -- confirm its shape
+    and that the rendered text matches the intended "Pose Sequence" label.
+    """
+    return f"Selection: **{evt._data}**"
+def pose_gallery_callback(evt: gr.SelectData):
+    """Return the label "Motion N", where N is the selection's `index` plus one."""
+    return f"Motion {evt.index+1}"
+def get_frame_index(evt: gr.SelectData):
+    """Return the `index` of the selection event unchanged."""
+    return evt.index
 def create_demo(model: ControlAnimationModel):
     with gr.Blocks() as demo:
+        with gr.Column():
             with gr.Row():
                 with gr.Column():
+                    # TODO: update so that model_link is customizable
+                    model_link = gr.Dropdown(
+                        label="Model Link",
+                        choices=["runwayml/stable-diffusion-v1-5"],
+                        value="runwayml/stable-diffusion-v1-5",
+                    )
+                    prompt = gr.Textbox(
+                        placeholder="Prompt",
+                        show_label=False,
+                        lines=2,
+                        info="Give a prompt for an animation you would like to generate. The prompt will be used to create the first initial frame and then the animation.",
                     )
+                    negative_prompt = gr.Textbox(
                         placeholder="Negative Prompt (optional)",
                         show_label=False,
                         lines=2,
                     )
                     gen_frames_button = gr.Button(
                         value="Generate Initial Frames", variant="primary"
                     )
                     with gr.Accordion("Advanced options", open=False):
                         if on_huggingspace:
                             video_length = gr.Slider(
                                 label="Video length", minimum=8, maximum=16, step=1
                         )
                 with gr.Column():
+                    gallery_pose_sequence = gr.Gallery(
+                        label="Pose Sequence",
+                        value=[
+                            ("__assets__/dance1.gif", "Motion 1"),
+                            ("__assets__/dance2.gif", "Motion 2"),
+                            ("__assets__/dance3.gif", "Motion 3"),
+                            ("__assets__/dance4.gif", "Motion 4"),
+                            ("__assets__/dance5.gif", "Motion 5"),
+                        ],
+                    ).style(columns=3)
+                    input_video_path = gr.Textbox(
+                        label="Pose Sequence", visible=False, value="Motion 1"
+                    )
+                    pose_sequence_selector = gr.Markdown("Pose Sequence: **Motion 1**")
+            with gr.Row():
+                with gr.Column(visible=True) as frame_selection_view:
+                    initial_frames = gr.Gallery(
+                        label="Initial Frames", show_label=False
+                    ).style(columns=4, rows=1, object_fit="contain", preview=True)
+                    gr.Markdown("Select an initial frame to start your animation with.")
+                    gen_animation_button = gr.Button(
+                        value="Select Initial Frame & Generate Animation",
+                        variant="secondary",
+                    )
+                with gr.Column(visible=False) as animation_view:
                     result = gr.Video(label="Generated Video")
+        with gr.Box(visible=False):
+            initial_frame_index = gr.Number(
+                label="Selected Initial Frame Index", value=-1, precision=0
+            )
+        input_video_path.change(on_video_path_update, None, pose_sequence_selector)
+        gallery_pose_sequence.select(pose_gallery_callback, None, input_video_path)
+        initial_frames.select(fn=get_frame_index, outputs=initial_frame_index)
+        frame_inputs = [
+            prompt,
+            input_video_path,
+            negative_prompt,
+        ]
+        animation_inputs = [
             prompt,
+            initial_frame_index,
+            input_video_path,
             model_link,
             motion_field_strength_x,
             motion_field_strength_y,
             t0,
             t1,
+            negative_prompt,
             chunk_size,
             video_length,
             merging_ratio,
             seed,
         ]
+        def submit_select(initial_frame_index: int):
+            if initial_frame_index != -1:  # Move to next step
                 return {
+                    frame_selection_view: gr.update(visible=False),
+                    animation_view: gr.update(visible=True),
                 }
             return {
+                frame_selection_view: gr.update(visible=True),
+                animation_view: gr.update(visible=False),
             }
         gen_frames_button.click(
+            fn=model.generate_initial_frames,
             inputs=frame_inputs,
             outputs=initial_frames,
         )
         gen_animation_button.click(
+            fn=submit_select,
+            inputs=initial_frame_index,
+            outputs=[frame_selection_view, animation_view],
+        ).then(
+            fn=None,
+            inputs=animation_inputs,
             outputs=result,
         )
+        # gr.Examples(examples=examples,
+        #             inputs=inputs,
+        #             outputs=result,
+        #             fn=None,
+        #             run_on_click=False,
+        #             cache_examples=on_huggingspace,
+        # )
     return demo