fix retrieval placeholders
- app.py +170 -94
- gen_utils.py +62 -2
- model_utils.py +35 -1
- renderer.py +138 -0
- requirements.txt +1 -0
- retrieval_loader.py +67 -0
- tmr_model.py +128 -0
app.py
CHANGED
@@ -3,40 +3,62 @@ import gradio as gr
 import spaces
 import torch
 import random
+import os
+from pathlib import Path
+from aitviewer.headless import HeadlessRenderer
+from aitviewer.configuration import CONFIG as AITVIEWER_CONFIG
+# import cv2
+# import moderngl
+# ctx = moderngl.create_context(standalone=True)
+# print(ctx)
+access_token_smpl = os.environ.get('HF_SMPL_TOKEN')
 
 zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- '
-
+print(zero.device) # <-- 'cuda:0' 🤗
+
 DEFAULT_TEXT = "A person is "
+from aitviewer.models.smpl import SMPLLayer
+def get_smpl_models():
+    REPO_ID = 'athn-nik/smpl_models'
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id=REPO_ID, allow_patterns="smplh*",
+                             token=access_token_smpl)
+
+def get_renderer():
+    from aitviewer.headless import HeadlessRenderer
+    from aitviewer.configuration import CONFIG as AITVIEWER_CONFIG
+    smpl_models_path = str(Path(get_smpl_models()))
+    AITVIEWER_CONFIG.update_conf({'playback_fps': 30,
+                                  'auto_set_floor': True,
+                                  'smplx_models': smpl_models_path,
+                                  'z_up': True})
+    return HeadlessRenderer()
+
 
-<div class="embed_hidden">
-[old placeholder project header and author links, truncated in this page view]
-<p>
-This space illustrates <a href='project.com' target='_blank'><b>XXX</b></a>, a method for XXX.
-What does it do?
-</p>
+
+WEBSITE = ("""<div class="embed_hidden" style="text-align: center;">
+<h1>MotionFix: Text-Driven 3D Human Motion Editing</h1>
+<h3>
+<a href="https://is.mpg.de/person/~nathanasiou" target="_blank" rel="noopener noreferrer">Nikos Athanasiou</a><sup>1</sup>,
+<a href="https://is.mpg.de/person/acseke" target="_blank" rel="noopener noreferrer">Alpar Cseke</a><sup>1</sup>,
+<br>
+<a href="https://ps.is.mpg.de/person/mdiomataris" target="_blank" rel="noopener noreferrer">Markos Diomataris</a><sup>1, 3</sup>,
+<a href="https://is.mpg.de/person/black" target="_blank" rel="noopener noreferrer">Michael J. Black</a><sup>1</sup>,
+<a href="https://imagine.enpc.fr/~varolg/" target="_blank" rel="noopener noreferrer">Gül Varol</a><sup>2</sup>,
+</h3>
+<h3>
+<sup>1</sup>Max Planck Institute for Intelligent Systems, Tübingen, Germany;
+<sup>2</sup>LIGM, École des Ponts, Univ Gustave Eiffel, CNRS, France,
+<sup>3</sup>ETH Zürich, Switzerland
+</h3>
+</div>
+<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
+<a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Arxiv-2405.20340-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
+<a href='https://arxiv.org/pdf/'><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
+<a href='https://motionfix.is.tue.mpg.de'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
+<a href='https://youtube.com/'><img src='https://img.shields.io/badge/YouTube-red?style=flat&logo=youtube&logoColor=white'></a>
 </div>
-"""
+""")
 
 @spaces.GPU
 def greet(n):
@@ -50,82 +72,136 @@ def greet(n)
 def clear():
     return ""
 
-[old helper function definition, truncated in this page view]
-
+def show_video(input_text):
+    from normalization import Normalizer
+    normalizer = Normalizer()
+    from diffusion import create_diffusion
+    from text_encoder import ClipTextEncoder
+    from tmed_denoiser import TMED_denoiser
+    model_ckpt = download_models()
+    checkpoint = torch.load(model_ckpt)
+
+    checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
+    tmed_denoiser = TMED_denoiser().to('cuda')
+    tmed_denoiser.load_state_dict(checkpoint, strict=False)
+    tmed_denoiser.eval()
+    text_encoder = ClipTextEncoder()
+    texts_cond = [input_text]
+
+    diffusion_process = create_diffusion(timestep_respacing=None,
+                                         learn_sigma=False, sigma_small=True,
+                                         diffusion_steps=300,
+                                         noise_schedule='squaredcos_cap_v2',
+                                         predict_xstart=True)
+    bsz = 1
+    seqlen_tgt = 180
+    no_of_texts = len(texts_cond)
+    texts_cond = ['']*no_of_texts + texts_cond
+    texts_cond = ['']*no_of_texts + texts_cond
+    text_emb, text_mask = text_encoder(texts_cond)
+
+    cond_emb_motion = torch.zeros(seqlen_tgt, bsz,
+                                  512,
+                                  device='cuda')
+    cond_motion_mask = torch.ones((bsz, seqlen_tgt),
+                                  dtype=bool, device='cuda')
+    mask_target = torch.ones((bsz, seqlen_tgt),
+                             dtype=bool, device='cuda')
+
+    diff_out = tmed_denoiser._diffusion_reverse(text_emb.to(cond_emb_motion.device),
+                                                text_mask.to(cond_emb_motion.device),
+                                                cond_emb_motion,
+                                                cond_motion_mask,
+                                                mask_target,
+                                                diffusion_process,
+                                                init_vec=None,
+                                                init_from='noise',
+                                                gd_text=4.0,
+                                                gd_motion=2.0,
+                                                steps_num=300)
+    edited_motion = diffout2motion(diff_out, normalizer).squeeze()
+    from renderer import render_motion, color_map, pack_to_render
+    # aitrenderer = get_renderer()
+    AIT_RENDERER = get_renderer()
+    SMPL_LAYER = SMPLLayer(model_type='smplh', ext='npz', gender='neutral')
+    edited_mot_to_render = pack_to_render(rots=edited_motion[..., 3:],
+                                          trans=edited_motion[..., :3])
+    import random
+    xx = random.randint(1, 1000)
+    fname = render_motion(AIT_RENDERER, [edited_mot_to_render],
+                          f"movie_example--{str(xx)}",
+                          pose_repr='aa',
+                          color=[color_map['generated']],
+                          smpl_layer=SMPL_LAYER)
+    return fname
+
+def retrieve_video(retrieve_text):
+    pass
+
 from huggingface_hub import hf_hub_download, hf_hub_url, cached_download
 
 def download_models():
     REPO_ID = 'athn-nik/example-model'
-
     return hf_hub_download(REPO_ID, filename="min_checkpoint.ckpt")
-
+
+def download_tmr():
+    REPO_ID = 'athn-nik/example-model'
+    # return hf_hub_download(REPO_ID, filename="min_checkpoint.ckpt")
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id=REPO_ID, allow_patterns="tmr*",
+                             token=access_token_smpl)
+import gradio as gr
+
+def clear():
+    return ""
+
+def random_number():
+    return "Random text"
+
 with gr.Blocks() as demo:
     gr.Markdown(WEBSITE)
-
-    input_text = gr.Textbox(placeholder="Type the edit text you want:",
-                            show_label=True,label="Input Text", value=DEFAULT_TEXT)
-    # output_text = gr.Textbox(label="Output Text")
 
     with gr.Row():
+        with gr.Column(scale=8):
+            retrieve_text = gr.Textbox(placeholder="Type the text for the motion you want to Retrieve:",
+                                       show_label=True, label="Retrieval Text", value=DEFAULT_TEXT)
+        with gr.Column(scale=1):
+            clear_button_retrieval = gr.Button("Clear Retrieval Text")
+
+    with gr.Row():
+        with gr.Column(scale=8):
+            input_text = gr.Textbox(placeholder="Type the edit text you want:",
+                                    show_label=True, label="Input Text", value=DEFAULT_TEXT)
+        with gr.Column(scale=1):
+            clear_button_edit = gr.Button("Clear Edit Text")
+
+    with gr.Row():
+        video_output = gr.Video(label="Generated Video", height=240, width=320)
+        retrieved_video_output = gr.Video(label="Retrieved Motion", height=240, width=320)
+
+    with gr.Row():
+        edit_button = gr.Button("Edit")
         retrieve_button = gr.Button("Retrieve")
-
+
         random_button = gr.Button("Random")
-    [old inline generation code, truncated in this page view]
-    noise_schedule='squaredcos_cap_v2',
-    predict_xstart=True) # noise vs sample
-    # uncond_tokens = [""] * len(texts_cond)
-    # if self.condition == 'text':
-    #     uncond_tokens.extend(texts_cond)
-    # elif self.condition == 'text_uncond':
-    #     uncond_tokens.extend(uncond_tokens)
-    bsz = 1
-    seqlen_tgt = 180
-    no_of_texts = len(texts_cond)
-    texts_cond = ['']*no_of_texts + texts_cond
-    texts_cond = ['']*no_of_texts + texts_cond
-    print(texts_cond)
-    text_emb, text_mask = text_encoder(texts_cond)
-
-    cond_emb_motion = torch.zeros(1, bsz,
-                                  512,
-                                  device='cuda')
-    cond_motion_mask = torch.ones((bsz, 1),
-                                  dtype=bool, device='cuda')
-    mask_target = torch.ones((1, bsz),
-                             dtype=bool, device='cuda')
-    # complete noise
-    # import ipdb;ipdb.set_trace()
-    diff_out = tmed_denoiser._diffusion_reverse(text_emb.to(cond_emb_motion.device),
-                                                text_mask.to(cond_emb_motion.device),
-                                                cond_emb_motion,
-                                                cond_motion_mask,
-                                                mask_target,
-                                                diffusion_process,
-                                                init_vec=None,
-                                                init_from='noise',
-                                                gd_text=4.0,
-                                                gd_motion=2.0,
-                                                steps_num=300)
-    edited_motion = diffout2motion(diff_out, normalizer)
-    clear_button.click(clear, outputs=input_text)
+
+    def process_and_show_video(input_text):
+        fname = show_video(input_text)
+        return fname
+
+    def process_and_retrieve_video(input_text):
+        fname = retrieve_video(input_text)
+        return fname
+
+    from gen_utils import read_config
+    from retrieval_loader import load_model_from_cfg
+    from retrieval_loader import get_tmr_model
+    tmr = get_tmr_model(download_tmr())
+    edit_button.click(process_and_show_video, inputs=input_text, outputs=video_output)
+    retrieve_button.click(process_and_retrieve_video, inputs=retrieve_text, outputs=retrieved_video_output)
+    # import ipdb;ipdb.set_trace()
+
+    clear_button_edit.click(clear, outputs=input_text)
+    clear_button_retrieval.click(clear, outputs=retrieve_text)
     random_button.click(random_number, outputs=input_text)
 
 demo.launch()
gen_utils.py
CHANGED
@@ -1,5 +1,10 @@
 import torch
 import numpy as np
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
 def cast_dict_to_tensors(d, device="cpu"):
     if isinstance(d, dict):
         return {k: cast_dict_to_tensors(v, device) for k, v in d.items()}
@@ -9,5 +14,60 @@ def cast_dict_to_tensors(d, device="cpu"):
         return d.to(device)
     else:
         return d
-
-
+
+def rgba(c: str):
+    from matplotlib import colors as mcolors
+    return mcolors.to_rgba(c)
+
+def rgb(c: str):
+    from matplotlib import colors as mcolors
+    return mcolors.to_rgb(c)
+
+# split the lightning checkpoint into
+# seperate state_dict modules for faster loading
+def extract_ckpt(run_dir, ckpt_name="last"):
+    import torch
+
+    ckpt_path = os.path.join(run_dir, f"logs/checkpoints/{ckpt_name}.ckpt")
+
+    extracted_path = os.path.join(run_dir, f"{ckpt_name}_weights")
+    os.makedirs(extracted_path, exist_ok=True)
+
+    new_path_template = os.path.join(extracted_path, "{}.pt")
+    ckpt_dict = torch.load(ckpt_path)
+    state_dict = ckpt_dict["state_dict"]
+    module_names = list(set([x.split(".")[0] for x in state_dict.keys()]))
+
+    # should be ['motion_encoder', 'text_encoder', 'motion_decoder'] for example
+    for module_name in module_names:
+        path = new_path_template.format(module_name)
+        sub_state_dict = {
+            ".".join(x.split(".")[1:]): y.cpu()
+            for x, y in state_dict.items()
+            if x.split(".")[0] == module_name
+        }
+        torch.save(sub_state_dict, path)
+
+import os
+import json
+from omegaconf import DictConfig, OmegaConf
+
+
+def save_config(cfg: DictConfig) -> str:
+    path = os.path.join(cfg.run_dir, "config.json")
+    config = OmegaConf.to_container(cfg, resolve=True)
+    with open(path, "w") as f:
+        string = json.dumps(config, indent=4)
+        f.write(string)
+    return path
+
+
+def read_config(run_dir: str, return_json=False) -> DictConfig:
+    path = os.path.join(run_dir, "config.json")
+    with open(path, "r") as f:
+        config = json.load(f)
+    if return_json:
+        return config
+    cfg = OmegaConf.create(config)
+    cfg.run_dir = run_dir
+    return cfg
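A quick usage sketch for the two new helpers (not part of the commit; the run-directory name is hypothetical, while the expected layout of logs/checkpoints/<name>.ckpt and config.json comes from the code above):

    from gen_utils import extract_ckpt, read_config

    run_dir = "experiments/tmr_run"   # hypothetical run directory
    cfg = read_config(run_dir)        # loads <run_dir>/config.json and sets cfg.run_dir
    extract_ckpt(run_dir, "last")     # splits logs/checkpoints/last.ckpt into last_weights/<module>.pt files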
model_utils.py
CHANGED
@@ -3,6 +3,10 @@ import torch.nn as nn
 import numpy as np
 import torch
 from torch import nn
+import torch
+
+from typing import List, Dict, Optional
+from torch import Tensor
 
 class TimestepEmbedderMDM(nn.Module):
     def __init__(self, latent_dim):
@@ -61,4 +65,34 @@ class PositionalEncoding(nn.Module):
         else:
             last = first + x.shape[0]
         x = x + self.pe[first:last, :]
-        return self.dropout(x)
+        return self.dropout(x)
+
+def collate_tensor_with_padding(batch: List[Tensor]) -> Tensor:
+    dims = batch[0].dim()
+    max_size = [max([b.size(i) for b in batch]) for i in range(dims)]
+    size = (len(batch),) + tuple(max_size)
+    canvas = batch[0].new_zeros(size=size)
+    for i, b in enumerate(batch):
+        sub_tensor = canvas[i]
+        for d in range(dims):
+            sub_tensor = sub_tensor.narrow(d, 0, b.size(d))
+        sub_tensor.add_(b)
+    return canvas
+
+
+def collate_x_dict(lst_x_dict: List, *, device: Optional[str] = 'cuda') -> Dict:
+    x = collate_tensor_with_padding([x_dict["x"] for x_dict in lst_x_dict])
+    if device is not None:
+        x = x.to(device)
+    length = [x_dict["length"] for x_dict in lst_x_dict]
+
+    if isinstance(length, list):
+        length = torch.tensor(length, device=device)
+
+    max_len = max(length)
+    mask = torch.arange(max_len, device=device).expand(
+        len(length), max_len
+    ) < length.unsqueeze(1)
+
+    batch = {"x": x, "length": length, "mask": mask}
+    return batch
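A minimal sketch of how the new collate helpers behave (illustrative only; the feature size, lengths, and CPU device below are assumptions, not part of the commit):

    import torch
    from model_utils import collate_x_dict

    batch = collate_x_dict([
        {"x": torch.randn(10, 64), "length": 10},
        {"x": torch.randn(7, 64), "length": 7},
    ], device='cpu')
    # batch["x"] is zero-padded to shape (2, 10, 64)
    # batch["mask"] is a (2, 10) boolean tensor, True for valid frames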
renderer.py
ADDED
@@ -0,0 +1,138 @@
+import os
+import torch
+from transform3d import transform_body_pose
+from aitviewer.headless import HeadlessRenderer
+from gen_utils import rgb, rgba
+
+color_map = {
+    'source_motion': rgba('darkred'),
+    'source': rgba('darkred'),
+    'target_motion': rgba('olivedrab'),
+    'input': rgba('olivedrab'),
+    'target': rgba('olivedrab'),
+    'generation': rgba('purple'),
+    'generated': rgba('steelblue'),
+    'denoised': rgba('purple'),
+    'noised': rgba('darkgrey'),
+}
+
+
+def pack_to_render(rots, trans, pose_repr='6d'):
+    # make axis-angle
+    # global_orient = transform_body_pose(rots, f"{pose_repr}->aa")
+
+    if rots.is_cuda:
+        rots = rots.detach().cpu()
+    if trans.is_cuda:
+        trans = trans.detach().cpu()
+
+    if pose_repr != 'aa':
+        body_pose = transform_body_pose(rots, f"{pose_repr}->aa")
+    else:
+        body_pose = rots
+    if trans is None:
+        trans = torch.zeros((rots.shape[0], rots.shape[1], 3),
+                            device=rots.device)
+    render_d = {'body_transl': trans,
+                'body_orient': body_pose[..., :3],
+                'body_pose': body_pose[..., 3:]}
+    return render_d
+
+
+def render_motion(renderer: HeadlessRenderer, datum: dict,
+                  filename: str, pose_repr='6d',
+                  color=(160 / 255, 160 / 255, 160 / 255, 1.0),
+                  return_verts=False, smpl_layer=None) -> None:
+    """
+    Function to render a video of a motion sequence
+    renderer: aitviewer renderer
+    datum: dictionary containing sequence of poses, body translations and body orientations
+        data could be numpy or pytorch tensors
+    filename: the absolute path you want the video to be saved at
+    """
+    from aitviewer.headless import HeadlessRenderer
+    from aitviewer.renderables.smpl import SMPLSequence
+
+    if isinstance(datum, dict): datum = [datum]
+    if not isinstance(color, list):
+        colors = [color]
+    else:
+        colors = color
+    # assert {'body_transl', 'body_orient', 'body_pose'}.issubset(set(datum[0].keys()))
+    # os.environ['DISPLAY'] = ":11"
+    gender = 'neutral'
+    only_skel = False
+    import sys
+    seqs_of_human_motions = []
+    if smpl_layer is None:
+        from aitviewer.models.smpl import SMPLLayer
+        smpl_layer = SMPLLayer(model_type='smplh',
+                               ext='npz',
+                               gender=gender)
+
+    for iid, mesh_seq in enumerate(datum):
+
+        if pose_repr != 'aa':
+            global_orient = transform_body_pose(mesh_seq['body_orient'],
+                                                f"{pose_repr}->aa")
+            body_pose = transform_body_pose(mesh_seq['body_pose'],
+                                            f"{pose_repr}->aa")
+        else:
+            global_orient = mesh_seq['body_orient']
+            body_pose = mesh_seq['body_pose']
+
+        body_transl = mesh_seq['body_transl']
+        sys.stdout.flush()
+
+        old = os.dup(1)
+        os.close(1)
+        os.open(os.devnull, os.O_WRONLY)
+        print(body_pose.shape)
+        print('\n')
+        smpl_template = SMPLSequence(body_pose,
+                                     smpl_layer,
+                                     poses_root=global_orient,
+                                     trans=body_transl,
+                                     color=colors[iid],
+                                     z_up=True)
+        if only_skel:
+            smpl_template.remove(smpl_template.mesh_seq)
+
+        seqs_of_human_motions.append(smpl_template)
+        renderer.scene.add(smpl_template)
+    # camera follows smpl sequence
+    # FIX CAMERA
+    from transform3d import get_z_rot
+    R_z = get_z_rot(global_orient[0], in_format='aa')
+    heading = -R_z[:, 1]
+    xy_facing = body_transl[0] + heading*2.5
+    camera = renderer.lock_to_node(seqs_of_human_motions[0],
+                                   (xy_facing[0], xy_facing[1], 1.5), smooth_sigma=5.0)
+
+    # /FIX CAMERA
+    if len(mesh_seq['body_pose']) == 1:
+        renderer.save_frame(file_path=str(filename) + '.png')
+        sfx = 'png'
+    else:
+        renderer.save_video(video_dir=str(filename), output_fps=30)
+        sfx = 'mp4'
+
+    # aitviewer adds a counter to the filename, we remove it
+    # filename.split('_')[-1].replace('.mp4', '')
+    # os.rename(filename + '_0.mp4', filename[:-4] + '.mp4')
+    if sfx == 'mp4':
+        os.rename(str(filename) + f'_0.{sfx}', str(filename) + f'.{sfx}')
+
+    # empty scene for the next rendering
+    for mesh in seqs_of_human_motions:
+        renderer.scene.remove(mesh)
+    renderer.scene.remove(camera)
+
+    sys.stdout.flush()
+    os.close(1)
+    os.dup(old)
+    os.close(old)
+    renderer.reset()
+    fname = f'{filename}.{sfx}'
+    return fname
requirements.txt
CHANGED
@@ -2,3 +2,4 @@ spaces
 gradio==4.36.1
 torch
 transformers==4.41.2
+hydra-core
retrieval_loader.py
ADDED
@@ -0,0 +1,67 @@
+from gen_utils import extract_ckpt
+import hydra
+import os
+from hydra.utils import instantiate
+from gen_utils import read_config
+from model_utils import collate_x_dict
+import torch
+from tmr_model import TMR_textencoder
+
+
+def load_model_from_cfg(cfg, ckpt_name="last", device="cuda", eval_mode=True):
+    import src.prepare  # noqa
+    import torch
+
+    run_dir = cfg.run_dir
+    model = hydra.utils.instantiate(cfg.model)
+
+    # Loading modules one by one
+    # motion_encoder / text_encoder / text_decoder
+    pt_path = os.path.join(run_dir, f"{ckpt_name}_weights")
+
+    if not os.path.exists(pt_path):
+        extract_ckpt(run_dir, ckpt_name)
+
+    for fname in os.listdir(pt_path):
+        module_name, ext = os.path.splitext(fname)
+        if ext != ".pt":
+            continue
+
+        module = getattr(model, module_name, None)
+        if module is None:
+            continue
+
+        module_path = os.path.join(pt_path, fname)
+        state_dict = torch.load(module_path)
+        module.load_state_dict(state_dict)
+    model = model.to(device)
+    if eval_mode:
+        model = model.eval()
+    return model
+
+# def get_tmr_model(run_dir):
+#     from gen_utils import read_config
+#     cfg = read_config(run_dir+'/tmr')
+#     import ipdb;ipdb.set_trace()
+#     text_model = instantiate(cfg.data.text_to_token_emb, device='cuda')
+#     model = load_model_from_cfg(cfg, 'last', eval_mode=True, device='cuda')
+#     return text_model, model
+
+def get_tmr_model(run_dir):
+    text_params = {
+        "latent_dim": 256,
+        "ff_size": 1024,
+        "num_layers": 6,
+        "num_heads": 4,
+        "activation": "gelu",
+        "modelpath": "distilbert-base-uncased",
+    }
+    "unit_motion_embs"
+    model = TMR_textencoder(**text_params)
+    state_dict = torch.load(f"{run_dir}/tmr/last_weights/text_encoder.pt",
+                            map_location='cuda')
+    # load values for the transformer only
+    model.load_state_dict(state_dict, strict=False)
+    model = model.eval()
+    return model.to('cuda')
tmr_model.py
ADDED
@@ -0,0 +1,128 @@
+from typing import List
+import torch.nn as nn
+import os
+
+import torch
+import numpy as np
+from torch import Tensor
+from transformers import AutoTokenizer, AutoModel
+from transformers import logging
+from torch.nn.functional import normalize
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super().__init__()
+
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+
+        self.register_buffer('pe', pe, persistent=False)
+
+    def forward(self, x):
+        return x + self.pe[:x.shape[0], :]
+
+
+class TMR_textencoder(nn.Module):
+    def __init__(self, modelpath: str, latent_dim: int, ff_size: int,
+                 num_layers: int, num_heads: int, activation: str, **kwargs) -> None:
+        super().__init__()
+
+        logging.set_verbosity_error()
+
+        # Tokenizer
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
+
+        # Text model
+        self.text_model = AutoModel.from_pretrained(modelpath)
+        # Then configure the model
+        self.text_encoded_dim = self.text_model.config.hidden_size
+
+        # Projection of the text-outputs into the latent space
+        self.projection = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(self.text_encoded_dim, latent_dim)
+        )
+
+        self.mu_token = nn.Parameter(torch.randn(latent_dim))
+        self.logvar_token = nn.Parameter(torch.randn(latent_dim))
+        self.sequence_pos_encoding = PositionalEncoding(latent_dim)
+
+        seq_trans_encoder_layer = nn.TransformerEncoderLayer(d_model=latent_dim,
+                                                             nhead=num_heads,
+                                                             dim_feedforward=ff_size,
+                                                             dropout=0.0,
+                                                             activation=activation)
+        self.seqTransEncoder = nn.TransformerEncoder(
+            seq_trans_encoder_layer,
+            num_layers=num_layers
+        )
+
+    def get_last_hidden_state(self, texts: List[str],
+                              return_mask: bool = False):
+        encoded_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)
+        output = self.text_model(**encoded_inputs.to(self.text_model.device))
+        if not return_mask:
+            return output.last_hidden_state
+        return output.last_hidden_state, encoded_inputs.attention_mask.to(dtype=bool)
+
+    def forward(self, texts: List[str]) -> Tensor:
+        text_encoded, mask = self.get_last_hidden_state(texts, return_mask=True)
+
+        x = self.projection(text_encoded)
+        bs, nframes, _ = x.shape
+        # bs, nframes, totjoints, nfeats = x.shape
+        # Switch sequence and batch_size because the input of
+        # Pytorch Transformer is [Sequence, Batch size, ...]
+        x = x.permute(1, 0, 2)  # now it is [nframes, bs, latent_dim]
+
+        mu_token = torch.tile(self.mu_token, (bs,)).reshape(bs, -1)
+        logvar_token = torch.tile(self.logvar_token, (bs,)).reshape(bs, -1)
+
+        # adding the distribution tokens for all sequences
+        xseq = torch.cat((mu_token[None], logvar_token[None], x), 0)
+
+        # create a bigger mask, to allow attend to mu and logvar
+        token_mask = torch.ones((bs, 2), dtype=bool, device=x.device)
+        aug_mask = torch.cat((token_mask, mask), 1)
+
+        # add positional encoding
+        xseq = self.sequence_pos_encoding(xseq)
+        final = self.seqTransEncoder(xseq, src_key_padding_mask=~aug_mask)
+
+        # only mu for inference
+        mu = final[0]
+        return mu
+
+    # compute score for retrieval
+    def compute_scores(self, texts, unit_embs=None, embs=None):
+        # not both empty
+        assert not (unit_embs is None and embs is None)
+        # not both filled
+        assert not (unit_embs is not None and embs is not None)
+
+        output_str = False
+        # if one input, squeeze the output
+        if isinstance(texts, str):
+            texts = [texts]
+            output_str = True
+
+        # compute unit_embs from embs if not given
+        if embs is not None:
+            unit_embs = normalize(embs)
+
+        with torch.no_grad():
+            latent_unit_texts = normalize(self(texts))
+            # compute cosine similarity between 0 and 1
+            scores = (unit_embs @ latent_unit_texts.T).T/2 + 0.5
+        scores = scores.cpu().numpy()
+
+        if output_str:
+            scores = scores[0]
+
+        return scores
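Since the commit is about the retrieval placeholders, here is a hedged sketch of how this text encoder could drive retrieval. It is not part of the commit: the motion_embs.pt file and the wiring into retrieve_video are assumptions; only get_tmr_model and compute_scores come from the files above.

    import torch
    from retrieval_loader import get_tmr_model

    run_dir = "<tmr snapshot dir>"                      # e.g. what download_tmr() in app.py returns
    tmr = get_tmr_model(run_dir)
    motion_embs = torch.load('motion_embs.pt').cuda()   # assumed precomputed (num_motions, 256) motion embeddings
    scores = tmr.compute_scores("A person is walking", embs=motion_embs)
    best_idx = scores.argmax()                          # index of the best-matching motion to render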