Spaces:

atnikos
/

motionfix-demo

Running

App Files Files Community

atnikos committed on Dec 4, 2024

Commit

10ff2d6

1 Parent(s): cfbdd27

attempts to fix

Browse files

Files changed (5) hide show

app.py +16 -12
body_renderer.py +7 -5
dataset_utils.py +7 -3
download_deps.py +3 -4
tmed_denoiser.py +21 -14

app.py CHANGED Viewed

@@ -76,8 +76,9 @@ class MotionEditor:
         self.MFIX_p = download_motionfix() + '/motionfix'
         # self.SOURCE_MOTS_p = download_embeddings() + '/embeddings'
         self.MFIX_DATASET_DICT = download_motionfix_dataset()
-        self.model_ckpt_path = download_models("last_zipped")
-        self.model_config_feats = download_model_config()
     @spaces.GPU
     def initialize_if_needed(self):
@@ -113,18 +114,20 @@ class MotionEditor:
         self.infeats = self.model_config_feats
         checkpoint = torch.load(model_ckpt, map_location=self.device)
         checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
         # Setup denoiser
-        self.tmed_denoiser = TMED_denoiser().to(self.device)
         self.tmed_denoiser.load_state_dict(checkpoint, strict=False)
         self.tmed_denoiser.eval()
         # Setup diffusion
         self.diffusion = create_diffusion(
             timestep_respacing=None,
             learn_sigma=False,
             sigma_small=True,
-            diffusion_steps=300,
             noise_schedule='squaredcos_cap_v2',
             predict_xstart=True
         )
@@ -144,6 +147,7 @@ class MotionEditor:
     def process_motion(self, input_text, key_to_use):
         """Main processing function, GPU-decorated"""
         self.initialize_if_needed()
         # Load dataset sample
         ds_sample = self.MFIX_DATASET_DICT[key_to_use]
@@ -192,9 +196,8 @@ class MotionEditor:
         seqlen_tgt = target_motion.shape[0]
         cond_motion_mask = torch.ones((bsz, seqlen_src), dtype=bool, device=self.device)
         mask_target = torch.ones((bsz, seqlen_tgt), dtype=bool, device=self.device)
         # Generate diffusion output
-        diff_out = self.tmed_denoiser._diffusion_reverse(
             text_emb.to(self.device),
             text_mask.to(self.device),
             source_motion,
@@ -204,8 +207,8 @@ class MotionEditor:
             init_vec=None,
             init_from='noise',
             gd_text=2.0,
-            gd_motion=2.0,
-            steps_num=300
         )
         return self.denormalize_motion(diff_out)
@@ -213,13 +216,14 @@ class MotionEditor:
     def denormalize_motion(self, diff_out):
         """Denormalize motion - called from within GPU-decorated function"""
         from geometry_utils import diffout2motion
         return diffout2motion(diff_out.permute(1, 0, 2), self.normalizer).squeeze()
     def render_result(self, edited_motion, source_motion):
         """Render result - called from within GPU-decorated function"""
         from body_renderer import get_render
         from transform3d import transform_body_pose, rotate_body_degrees
-        # import ipdb; ipdb.set_trace()
         # Transform motions
         edited_motion_transformed = self.transform_motion(edited_motion)
         source_motion_transformed = self.transform_motion(source_motion)
@@ -227,7 +231,7 @@ class MotionEditor:
         # Render video
         if os.path.exists('./output_movie.mp4'):
             os.remove('./output_movie.mp4')
         return get_render(
             self.body_model,
             [edited_motion_transformed['trans'].detach().cpu(),

         self.MFIX_p = download_motionfix() + '/motionfix'
         # self.SOURCE_MOTS_p = download_embeddings() + '/embeddings'
         self.MFIX_DATASET_DICT = download_motionfix_dataset()
+        self.model_ckpt_path = download_models("899_bs128_zipped") # small_model_zipped_last/last_zipped
+        self.model_cfg = download_model_config('bs_128_conf') # small_model_config / big_model_config
+        self.model_config_feats = self.model_cfg.model.input_feats
     @spaces.GPU
     def initialize_if_needed(self):
         self.infeats = self.model_config_feats
         checkpoint = torch.load(model_ckpt, map_location=self.device)
         checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
         # Setup denoiser
+        self.tmed_denoiser = TMED_denoiser(latent_dim=self.model_cfg.model.latent_dim,
+                                           num_layers=8,
+                                           ff_size=1024,
+                                           num_heads=4).to(self.device)
         self.tmed_denoiser.load_state_dict(checkpoint, strict=False)
         self.tmed_denoiser.eval()
         # Setup diffusion
         self.diffusion = create_diffusion(
             timestep_respacing=None,
             learn_sigma=False,
             sigma_small=True,
+            diffusion_steps=self.model_cfg.model.diff_params.num_train_timesteps,
             noise_schedule='squaredcos_cap_v2',
             predict_xstart=True
         )
     def process_motion(self, input_text, key_to_use):
         """Main processing function, GPU-decorated"""
         self.initialize_if_needed()
+        # import ipdb; ipdb.set_trace()
         # Load dataset sample
         ds_sample = self.MFIX_DATASET_DICT[key_to_use]
         seqlen_tgt = target_motion.shape[0]
         cond_motion_mask = torch.ones((bsz, seqlen_src), dtype=bool, device=self.device)
         mask_target = torch.ones((bsz, seqlen_tgt), dtype=bool, device=self.device)
         # Generate diffusion output
+        diff_out = self.tmed_denoiser._diffusion_reverse(
             text_emb.to(self.device),
             text_mask.to(self.device),
             source_motion,
             init_vec=None,
             init_from='noise',
             gd_text=2.0,
+            gd_motion=3.0,
+            steps_num=self.model_cfg.model.diff_params.num_train_timesteps
         )
         return self.denormalize_motion(diff_out)
     def denormalize_motion(self, diff_out):
         """Denormalize motion - called from within GPU-decorated function"""
         from geometry_utils import diffout2motion
+        # import ipdb; ipdb.set_trace()
         return diffout2motion(diff_out.permute(1, 0, 2), self.normalizer).squeeze()
     def render_result(self, edited_motion, source_motion):
         """Render result - called from within GPU-decorated function"""
         from body_renderer import get_render
         from transform3d import transform_body_pose, rotate_body_degrees
         # Transform motions
         edited_motion_transformed = self.transform_motion(edited_motion)
         source_motion_transformed = self.transform_motion(source_motion)
         # Render video
         if os.path.exists('./output_movie.mp4'):
             os.remove('./output_movie.mp4')
+        # import ipdb; ipdb.set_trace()
         return get_render(
             self.body_model,
             [edited_motion_transformed['trans'].detach().cpu(),

body_renderer.py CHANGED Viewed

@@ -30,7 +30,9 @@ def get_render(body_model_loaded,
     if not isinstance(body_pose, list):
         body_pose = [body_pose]
-    for trans, orient,pose in zip(body_trans,body_orient,body_pose):
         vertices= run_smpl_fwd_vertices(body_model_loaded,
                                         trans,
@@ -38,15 +40,15 @@ def get_render(body_model_loaded,
                                         pose)
         vertices=vertices.vertices
-        vertices = subsample_tensor(vertices, original_fps=30, target_fps=25)
         vertices = vertices.detach().cpu().numpy()
         vertices_list.append(vertices)
     #Initialising the renderer
     from renderer.humor import HumorRenderer
-    fps = 25.0
-    imw = 480 # 480
-    imh = 360 # 360
     renderer = HumorRenderer(fps=fps, imw=imw, imh=imh)
     if len(vertices_list)==2:

     if not isinstance(body_pose, list):
         body_pose = [body_pose]
+    for trans, orient, pose in zip(body_trans,
+                                  body_orient,
+                                  body_pose):
         vertices= run_smpl_fwd_vertices(body_model_loaded,
                                         trans,
                                         pose)
         vertices=vertices.vertices
+        # vertices = subsample_tensor(vertices, original_fps=30, target_fps=25)
         vertices = vertices.detach().cpu().numpy()
         vertices_list.append(vertices)
     #Initialising the renderer
     from renderer.humor import HumorRenderer
+    fps = 30.0
+    imw = 720 # 480
+    imh = 540 # 360
     renderer = HumorRenderer(fps=fps, imw=imw, imh=imh)
     if len(vertices_list)==2:

dataset_utils.py CHANGED Viewed

@@ -10,13 +10,17 @@ def load_motionfix(path_to_data):
     # Fill each dictionary with the corresponding data
     for key in splits['train']:
-        train_data[key] = dataset[key]
     for key in splits['val']:
-        val_data[key] = dataset[key]
     for key in splits['test']:
-        test_data[key] = dataset[key]
     validation_test_data = {**val_data, **test_data}
     return train_data, validation_test_data

     # Fill each dictionary with the corresponding data
     for key in splits['train']:
+        if key in dataset:
+            train_data[key] = dataset[key]
     for key in splits['val']:
+        if key in dataset:
+            val_data[key] = dataset[key]
     for key in splits['test']:
+        if key in dataset:
+            test_data[key] = dataset[key]
     validation_test_data = {**val_data, **test_data}
     return train_data, validation_test_data

download_deps.py CHANGED Viewed

@@ -16,13 +16,12 @@ def download_models(ckpt_to_dl):
     REPO_ID = 'athn-nik/example-model'
     return hf_hub_download(REPO_ID, filename=f"{ckpt_to_dl}.ckpt")
-def download_model_config():
     REPO_ID = 'athn-nik/example-model'
-    path_to_config = hf_hub_download(REPO_ID, filename="tmed/.hydra/config.yaml")
     from omegaconf import OmegaConf
     model_cfg = OmegaConf.load(path_to_config)
-    return model_cfg.model.input_feats
 def download_motion_from_dataset(key_to_dl):
     REPO_ID = 'athn-nik/example-model'

     REPO_ID = 'athn-nik/example-model'
     return hf_hub_download(REPO_ID, filename=f"{ckpt_to_dl}.ckpt")
+def download_model_config(config_name):
     REPO_ID = 'athn-nik/example-model'
+    path_to_config = hf_hub_download(REPO_ID, filename=f"{config_name}/.hydra/config.yaml")
     from omegaconf import OmegaConf
     model_cfg = OmegaConf.load(path_to_config)
+    return model_cfg
 def download_motion_from_dataset(key_to_dl):
     REPO_ID = 'athn-nik/example-model'

tmed_denoiser.py CHANGED Viewed

@@ -17,6 +17,7 @@ class TMED_denoiser(nn.Module):
                  text_encoded_dim: int = 768,
                  pred_delta_motion: bool = False,
                  use_sep: bool = True,
                  **kwargs) -> None:
         super().__init__()
@@ -28,6 +29,8 @@ class TMED_denoiser(nn.Module):
         self.pose_proj_in_source = nn.Linear(nfeats, self.latent_dim)
         self.pose_proj_in_target = nn.Linear(nfeats, self.latent_dim)
         self.pose_proj_out = nn.Linear(self.latent_dim, nfeats)
         # emb proj
         if self.condition in ["text", "text_uncond"]:
@@ -47,8 +50,9 @@ class TMED_denoiser(nn.Module):
         self.use_sep = use_sep
         self.query_pos = PositionalEncoding(self.latent_dim, dropout)
         self.mem_pos = PositionalEncoding(self.latent_dim, dropout)
-        if self.use_sep:
-            self.sep_token = nn.Parameter(torch.randn(1, self.latent_dim))
         # use torch transformer
         encoder_layer = nn.TransformerEncoderLayer(
@@ -83,7 +87,7 @@ class TMED_denoiser(nn.Module):
         # 1. time_embeddingno
         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timestep.expand(noised_motion.shape[1]).clone().to(noised_motion.device)
         time_emb = self.embed_timestep(timesteps).to(dtype=noised_motion.dtype)
         # make it S first
         # time_emb = self.time_embedding(time_emb).unsqueeze(0)
@@ -119,16 +123,19 @@ class TMED_denoiser(nn.Module):
         # if self.diffusion_only:
         proj_noised_motion = self.pose_proj_in_target(noised_motion)
-        if self.use_sep:
-            sep_token_batch = torch.tile(self.sep_token, (bs,)).reshape(bs,
-                                                                        -1)
-            xseq = torch.cat((emb_latent, motion_embeds_proj,
-                            sep_token_batch[None],
-                            proj_noised_motion), axis=0)
         else:
-            xseq = torch.cat((emb_latent, motion_embeds_proj,
-                              proj_noised_motion), axis=0)
         # if self.ablation_skip_connection:
         #     xseq = self.query_pos(xseq)
         #     tokens = self.encoder(xseq)
@@ -249,8 +256,8 @@ class TMED_denoiser(nn.Module):
                 mask_src_parts = inpaint_dict['mask'].unsqueeze(1).repeat(1,
                                                                       mot_len,
                                                                       1)
-                uncond_eps = uncond_eps*(~mask_src_parts) + source_mot*mask_src_parts
-                cond_eps_text = cond_eps_text*(~mask_src_parts) + source_mot*mask_src_parts
             half_eps = uncond_eps + guidance_text_n_motion * (cond_eps_text - uncond_eps)
             eps = torch.cat([half_eps, half_eps], dim=0)
         else:

                  text_encoded_dim: int = 768,
                  pred_delta_motion: bool = False,
                  use_sep: bool = True,
+                 motion_condition: str = 'source',
                  **kwargs) -> None:
         super().__init__()
         self.pose_proj_in_source = nn.Linear(nfeats, self.latent_dim)
         self.pose_proj_in_target = nn.Linear(nfeats, self.latent_dim)
         self.pose_proj_out = nn.Linear(self.latent_dim, nfeats)
+        self.first_pose_proj = nn.Linear(self.latent_dim, nfeats)
+        self.motion_condition = motion_condition
         # emb proj
         if self.condition in ["text", "text_uncond"]:
         self.use_sep = use_sep
         self.query_pos = PositionalEncoding(self.latent_dim, dropout)
         self.mem_pos = PositionalEncoding(self.latent_dim, dropout)
+        if self.motion_condition == "source":
+            if self.use_sep:
+                self.sep_token = nn.Parameter(torch.randn(1, self.latent_dim))
         # use torch transformer
         encoder_layer = nn.TransformerEncoderLayer(
         # 1. time_embeddingno
         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timestep.expand(noised_motion.shape[1]).clone()
         time_emb = self.embed_timestep(timesteps).to(dtype=noised_motion.dtype)
         # make it S first
         # time_emb = self.time_embedding(time_emb).unsqueeze(0)
         # if self.diffusion_only:
         proj_noised_motion = self.pose_proj_in_target(noised_motion)
+        if motion_embeds is None:
+            xseq = torch.cat((emb_latent, proj_noised_motion), axis=0)
         else:
+            if self.use_sep:
+                sep_token_batch = torch.tile(self.sep_token, (bs,)).reshape(bs,
+                                                                         -1)
+                xseq = torch.cat((emb_latent, motion_embeds_proj,
+                                sep_token_batch[None],
+                                proj_noised_motion), axis=0)
+            else:
+                xseq = torch.cat((emb_latent, motion_embeds_proj,
+                                  proj_noised_motion), axis=0)
         # if self.ablation_skip_connection:
         #     xseq = self.query_pos(xseq)
         #     tokens = self.encoder(xseq)
                 mask_src_parts = inpaint_dict['mask'].unsqueeze(1).repeat(1,
                                                                       mot_len,
                                                                       1)
+                uncond_eps = uncond_eps*(mask_src_parts) + source_mot*(~mask_src_parts)
+                cond_eps_text = cond_eps_text*(mask_src_parts) + source_mot*(~mask_src_parts)
             half_eps = uncond_eps + guidance_text_n_motion * (cond_eps_text - uncond_eps)
             eps = torch.cat([half_eps, half_eps], dim=0)
         else: