Upload CondViTForEmbedding
- config.json +23 -0
- hf_model.py +71 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +357 -0
- module.py +167 -0
config.json
ADDED
@@ -0,0 +1,23 @@
{
  "_name_or_path": "__debug_save",
  "architectures": [
    "CondViTForEmbedding"
  ],
  "auto_map": {
    "AutoConfig": "hf_model.CondViTConfig",
    "AutoModel": "hf_model.CondViTForEmbedding"
  },
  "device": "cpu",
  "heads": 12,
  "input_resolution": 224,
  "layers": 12,
  "lm_backbone": "sentence-transformers/sentence-t5-xl",
  "lm_revision": "e0976ba9afd18be963c22c680367a3928c44fd22",
  "model_type": "condvit",
  "n_categories": 10,
  "output_dim": 512,
  "patch_size": 16,
  "torch_dtype": "float32",
  "transformers_version": "4.37.1",
  "width": 768
}
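Because auto_map points AutoConfig and AutoModel at hf_model.CondViTConfig and hf_model.CondViTForEmbedding, the checkpoint can be loaded through the Transformers Auto classes once remote code is trusted. A minimal sketch, where "." is a placeholder for a local clone of this repository (or its Hub id):

from transformers import AutoConfig, AutoModel

# "." is a placeholder: a local clone of this repository, or its repository id on the Hub.
config = AutoConfig.from_pretrained(".", trust_remote_code=True)
model = AutoModel.from_pretrained(".", trust_remote_code=True)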
hf_model.py
ADDED
@@ -0,0 +1,71 @@
from transformers import PreTrainedModel, PretrainedConfig
from .module import ConditionalViT

from sentence_transformers import SentenceTransformer


class CondViTConfig(PretrainedConfig):
    model_type = "condvit"

    def __init__(
        self,
        input_resolution: int = 224,
        patch_size: int = 16,
        width: int = 768,
        layers: int = 12,
        heads: int = 12,
        output_dim: int = 512,
        n_categories: int = 10,
        lm_backbone: str = "sentence-transformers/sentence-t5-xl",
        lm_revision: str = "e0976ba9afd18be963c22c680367a3928c44fd22",
        device: str = "cpu",
        **kwargs
    ):
        self.input_resolution = input_resolution
        self.patch_size = patch_size
        self.width = width
        self.layers = layers
        self.heads = heads
        self.output_dim = output_dim
        self.n_categories = n_categories

        self.lm_backbone = lm_backbone
        self.lm_revision = lm_revision

        self.device = device

        super().__init__(**kwargs)


class CondViTForEmbedding(PreTrainedModel):
    config_class = CondViTConfig

    def __init__(self, config):
        super().__init__(config)

        self.condvit = ConditionalViT(
            input_resolution=config.input_resolution,
            patch_size=config.patch_size,
            width=config.width,
            layers=config.layers,
            heads=config.heads,
            output_dim=config.output_dim,
        )
        if config.device:
            self.condvit.to(config.device)

        self.lm = SentenceTransformer(
            config.lm_backbone, revision=config.lm_revision, device=config.device
        )

    def forward(self, pixel_values, texts=None):
        if texts is not None:
            text_embeddings = self.lm.encode(
                texts,
                convert_to_tensor=True,
                convert_to_numpy=False,
            )
            text_embeddings = text_embeddings.to(pixel_values.device)
        else:
            text_embeddings = None
        return self.condvit(imgs=pixel_values, c=text_embeddings)
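For context, a sketch of what the wrapper's forward expects: a batch of pixel values at the configured 224x224 resolution and, optionally, one caption per image, which the SentenceTransformer backbone turns into the conditioning vectors passed to ConditionalViT. The dummy tensors and captions below are placeholders; real inputs should go through whatever image preprocessing the model was trained with (not included in this commit):

import torch

# `model` as loaded above via AutoModel; dummy 224x224 RGB batch of size 2.
pixel_values = torch.randn(2, 3, 224, 224)
embeddings = model(pixel_values)                                              # unconditional, shape [2, 512]
conditioned = model(pixel_values, texts=["a red handbag", "blue sneakers"])   # one caption per image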
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:55b611cb11b4bf73bf8715538c3be0240daabdd2a48314c214e5cfa9c1adb742
size 4972895436
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:339d88b3be497bb3333457701c9487260fc0a270388a3029d6869322d83ee3d5
size 338708184
model.safetensors.index.json
ADDED
@@ -0,0 +1,357 @@
{
  "metadata": {
    "total_size": 5311558660
  },
  "weight_map": {
    "condvit.c_pos_embedding": "model-00001-of-00002.safetensors",
    "condvit.class_embedding": "model-00001-of-00002.safetensors",
    "condvit.conv1.weight": "model-00001-of-00002.safetensors",
    "condvit.ln_post.bias": "model-00001-of-00002.safetensors",
    "condvit.ln_post.weight": "model-00001-of-00002.safetensors",
    "condvit.ln_pre.bias": "model-00001-of-00002.safetensors",
    "condvit.ln_pre.weight": "model-00001-of-00002.safetensors",
    "condvit.logit_scale": "model-00001-of-00002.safetensors",
    "condvit.positional_embedding": "model-00001-of-00002.safetensors",
    "condvit.proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.attn.in_proj_bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.attn.in_proj_weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.ln_1.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.ln_1.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.ln_2.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.ln_2.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
    "condvit.transformer.resblocks.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.18.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.19.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.20.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.21.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.1.DenseReluDense.wi.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.22.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.1.DenseReluDense.wi.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.23.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.1.DenseReluDense.wi.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
    "lm.0.auto_model.encoder.final_layer_norm.weight": "model-00002-of-00002.safetensors",
    "lm.0.auto_model.shared.weight": "model-00001-of-00002.safetensors",
    "lm.2.linear.weight": "model-00002-of-00002.safetensors"
  }
}
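This index is what sharded loading consults to resolve each parameter: every key in weight_map names the shard file that stores it, and total_size is the combined byte count. As a sketch, the same lookup can be done by hand with the safetensors library (paths are relative to a local clone of the repository, and the chosen key is just an example):

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

key = "condvit.proj.weight"
shard = index["weight_map"][key]              # -> "model-00001-of-00002.safetensors"
with safe_open(shard, framework="pt", device="cpu") as shard_file:
    tensor = shard_file.get_tensor(key)       # the projection weight, read from its shard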
module.py
ADDED
@@ -0,0 +1,167 @@
import torch
from torch import nn

from collections import OrderedDict
import logging

logger = logging.getLogger(__name__)


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        if self.weight.dtype != x.dtype:
            orig_type = x.dtype
            ret = super().forward(x.type(self.weight.dtype))
            return ret.type(orig_type)
        else:
            return super().forward(x)


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(
        self,
        d_model: int,
        n_head: int,
        attn_mask: torch.Tensor = None,
    ):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict(
                [
                    (
                        "c_fc",
                        nn.Linear(d_model, d_model * 4),
                    ),
                    ("gelu", QuickGELU()),
                    (
                        "c_proj",
                        nn.Linear(d_model * 4, d_model),
                    ),
                ]
            )
        )
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = (
            self.attn_mask.to(dtype=x.dtype, device=x.device)
            if self.attn_mask is not None
            else None
        )
        return self.attn(
            x,
            x,
            x,
            need_weights=False,
            attn_mask=self.attn_mask,
        )[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(
        self,
        width: int,
        layers: int,
        heads: int,
        attn_mask: torch.Tensor = None,
    ):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(
            *[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]
        )

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class ConditionalViT(nn.Module):
    def __init__(
        self,
        input_resolution: int,
        patch_size: int,
        width: int,
        layers: int,
        heads: int,
        output_dim: int,
    ):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )

        scale = width**-0.5

        self.class_embedding = nn.Parameter(scale * torch.randn(width))

        self.c_pos_embedding = nn.Parameter(scale * torch.randn(1, width))

        self.positional_embedding = nn.Parameter(
            scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)
        )
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)
        self.ln_post = LayerNorm(width)
        self.logit_scale = torch.nn.Parameter(torch.ones([]) * 4.6052)

        self.proj = nn.Linear(width, output_dim, bias=False)

    def forward(self, imgs: torch.Tensor, c: torch.Tensor = None):
        """
        imgs : Batch of images
        c : Text embedding.
        """

        x = self.conv1(imgs)  # shape = [*, width, grid, grid]
        # shape = [*, width, grid ** 2]
        x = x.reshape(x.shape[0], x.shape[1], -1)
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # Gather CLS, Grid, maybe CAT, and positional embedding
        tokens = [self.class_embedding.tile(x.shape[0], 1, 1), x]  # NLD
        pos_embed = [self.positional_embedding]  # LD

        if c is not None:
            pos_embed += [self.c_pos_embedding]  # +1D -> N1D
            tokens += [c.unsqueeze(1)]

        x = torch.cat(tokens, dim=1)  # shape = [*, grid ** 2 + 1|2, width] = N(L|L+1)D
        pos_embed = torch.cat(pos_embed, dim=0).unsqueeze(0)  # 1(L|L+1)D

        x = x + pos_embed
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND

        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.ln_post(x[:, 0, :])

        x = self.proj(x)

        return x
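As a standalone sanity check of the shapes this module produces, a sketch only: ConditionalViT is instantiated directly with the values from config.json, and the random image batch and 768-dimensional conditioning vectors are placeholders for real preprocessed images and sentence-t5 embeddings.

import torch
from module import ConditionalViT

vit = ConditionalViT(
    input_resolution=224, patch_size=16, width=768, layers=12, heads=12, output_dim=512
)
imgs = torch.randn(2, 3, 224, 224)   # dummy image batch
c = torch.randn(2, 768)              # dummy conditioning vectors, one per image
print(vit(imgs).shape)               # torch.Size([2, 512]) -- CLS token only
print(vit(imgs, c=c).shape)          # torch.Size([2, 512]) -- with the extra conditioning token appended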