KangarooGroup
/

kangaroo

@@ -1,29 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 448,
-        "layers": 24,
-        "width": 1024,
-        "drop_path_rate": 0,
-        "head_width": 64,
-        "mlp_ratio": 2.6667,
-        "patch_size": 14,
-        "eva_model_name": "eva-clip-l-14-448",
-        "xattn": true,
-        "fusedLN": true,
-        "rope": true,
-        "pt_hw_seq_len": 16,
-        "intp_freq": true,
-        "naiveswiglu": true,
-        "subln": true
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12,
-        "xattn": false,
-        "fusedLN": true
-    }
-}

modeling_kangaroo.py CHANGED Viewed

@@ -1069,9 +1069,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = LlamaModel(config)
-        model_name = "EVA02-CLIP-L-14-448"
         self.vocab_size = config.vocab_size
-        self.vision_tower = build_vision_tower(model_name)
         self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
         self.vocab_size = config.vocab_size

     def __init__(self, config):
         super().__init__(config)
         self.model = LlamaModel(config)
         self.vocab_size = config.vocab_size
+        self.vision_tower = build_vision_tower()
         self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
         self.vocab_size = config.vocab_size

vision_tower_builder.py CHANGED Viewed

@@ -3,7 +3,6 @@
 # --------------------------------------------------------
 import math
 import os
-import json
 import logging
 import torch
@@ -669,56 +668,46 @@ class EVAVisionTransformer(nn.Module):
 @dataclass
 class CLIPVisionCfg:
-    layers: Union[Tuple[int, int, int, int], int] = 12
-    width: int = 768
     head_width: int = 64
-    mlp_ratio: float = 4.0
-    patch_size: int = 16
-    image_size: Union[Tuple[int, int], int] = 224
     ls_init_value: Optional[float] = None  # layer scale initial value
     patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
     global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
-    drop_path_rate: Optional[float] = None  # drop path rate
     timm_model_name: str = None  # a valid model name overrides layers, width, patch_size
     timm_model_pretrained: bool = False  # use (imagenet) pretrained weights for named model
     timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
     timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
     timm_proj_bias: bool = False  # enable bias final projection
-    eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
     fusedLN: bool = False
-    xattn: bool = False
     postnorm: bool = False
-    rope: bool = False
     pt_hw_seq_len: int = 16   # 224/14
-    intp_freq: bool = False
-    naiveswiglu: bool = False
-    subln: bool = False
-def build_vision_tower(
-        model_name: str,
-        precision: str = 'bf16',
-        device: Union[str, torch.device] = 'cpu',
-):
-    if isinstance(device, str):
-        device = torch.device(device)
-    model_cfg = json.load(open(model_name + '.json'))
-    if 'rope' in model_cfg.get('vision_cfg', {}):
-        if model_cfg['vision_cfg']['rope']:
             os.environ['RoPE'] = "1"
     else:
         os.environ['RoPE'] = "0"
-    vision_cfg = CLIPVisionCfg(**model_cfg['vision_cfg'])
     if vision_cfg.fusedLN:
-        try:
-            from apex.normalization import FusedLayerNorm
-        except:
-            FusedLayerNorm = LayerNorm
-            print("Please 'pip install apex'")
         norm_layer = partial(FusedLayerNorm, eps=1e-6)
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
@@ -726,7 +715,7 @@ def build_vision_tower(
     vision_tower = EVAVisionTransformer(
         img_size = vision_cfg.image_size,
         patch_size = vision_cfg.patch_size,
-        num_classes = model_cfg['embed_dim'],
         use_mean_pooling = vision_cfg.global_average_pool,
         init_values = vision_cfg.ls_init_value,
         patch_dropout = vision_cfg.patch_dropout,
@@ -750,8 +739,6 @@ def build_vision_tower(
         logging.info(f'convert precision to {precision}')
         vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
-    vision_tower.to(device=device)
     vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
     vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)

 # --------------------------------------------------------
 import math
 import os
 import logging
 import torch
 @dataclass
 class CLIPVisionCfg:
+    embed_dim: int = 768
+    layers: Union[Tuple[int, int, int, int], int] = 24
+    width: int = 1024
     head_width: int = 64
+    mlp_ratio: float = 2.6667
+    patch_size: int = 14
+    image_size: Union[Tuple[int, int], int] = 448
     ls_init_value: Optional[float] = None  # layer scale initial value
     patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
     global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
+    drop_path_rate: Optional[float] = 0  # drop path rate
     timm_model_name: str = None  # a valid model name overrides layers, width, patch_size
     timm_model_pretrained: bool = False  # use (imagenet) pretrained weights for named model
     timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
     timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
     timm_proj_bias: bool = False  # enable bias final projection
+    eva_model_name: str = "eva-clip-l-14-448" # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
     fusedLN: bool = False
+    xattn: bool = True
     postnorm: bool = False
+    rope: bool = True
     pt_hw_seq_len: int = 16   # 224/14
+    intp_freq: bool = True
+    naiveswiglu: bool = True
+    subln: bool = True
+def build_vision_tower(precision: str = 'bf16'):
+    vision_cfg = CLIPVisionCfg()
+    if vision_cfg.rope:
             os.environ['RoPE'] = "1"
     else:
         os.environ['RoPE'] = "0"
     if vision_cfg.fusedLN:
+        from apex.normalization import FusedLayerNorm
         norm_layer = partial(FusedLayerNorm, eps=1e-6)
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
     vision_tower = EVAVisionTransformer(
         img_size = vision_cfg.image_size,
         patch_size = vision_cfg.patch_size,
+        num_classes = vision_cfg.embed_dim,
         use_mean_pooling = vision_cfg.global_average_pool,
         init_values = vision_cfg.ls_init_value,
         patch_dropout = vision_cfg.patch_dropout,
         logging.info(f'convert precision to {precision}')
         vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
     vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
     vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)