fix vit json
Browse files- EVA02-CLIP-L-14-448.json +0 -29
- modeling_kangaroo.py +1 -2
- vision_tower_builder.py +22 -35
EVA02-CLIP-L-14-448.json
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"embed_dim": 768,
|
| 3 |
-
"vision_cfg": {
|
| 4 |
-
"image_size": 448,
|
| 5 |
-
"layers": 24,
|
| 6 |
-
"width": 1024,
|
| 7 |
-
"drop_path_rate": 0,
|
| 8 |
-
"head_width": 64,
|
| 9 |
-
"mlp_ratio": 2.6667,
|
| 10 |
-
"patch_size": 14,
|
| 11 |
-
"eva_model_name": "eva-clip-l-14-448",
|
| 12 |
-
"xattn": true,
|
| 13 |
-
"fusedLN": true,
|
| 14 |
-
"rope": true,
|
| 15 |
-
"pt_hw_seq_len": 16,
|
| 16 |
-
"intp_freq": true,
|
| 17 |
-
"naiveswiglu": true,
|
| 18 |
-
"subln": true
|
| 19 |
-
},
|
| 20 |
-
"text_cfg": {
|
| 21 |
-
"context_length": 77,
|
| 22 |
-
"vocab_size": 49408,
|
| 23 |
-
"width": 768,
|
| 24 |
-
"heads": 12,
|
| 25 |
-
"layers": 12,
|
| 26 |
-
"xattn": false,
|
| 27 |
-
"fusedLN": true
|
| 28 |
-
}
|
| 29 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modeling_kangaroo.py
CHANGED
|
@@ -1069,9 +1069,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
|
|
| 1069 |
def __init__(self, config):
|
| 1070 |
super().__init__(config)
|
| 1071 |
self.model = LlamaModel(config)
|
| 1072 |
-
model_name = "EVA02-CLIP-L-14-448"
|
| 1073 |
self.vocab_size = config.vocab_size
|
| 1074 |
-
self.vision_tower = build_vision_tower(
|
| 1075 |
self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
|
| 1076 |
|
| 1077 |
self.vocab_size = config.vocab_size
|
|
|
|
| 1069 |
def __init__(self, config):
|
| 1070 |
super().__init__(config)
|
| 1071 |
self.model = LlamaModel(config)
|
|
|
|
| 1072 |
self.vocab_size = config.vocab_size
|
| 1073 |
+
self.vision_tower = build_vision_tower()
|
| 1074 |
self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
|
| 1075 |
|
| 1076 |
self.vocab_size = config.vocab_size
|
vision_tower_builder.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
| 3 |
# --------------------------------------------------------
|
| 4 |
import math
|
| 5 |
import os
|
| 6 |
-
import json
|
| 7 |
import logging
|
| 8 |
|
| 9 |
import torch
|
|
@@ -669,56 +668,46 @@ class EVAVisionTransformer(nn.Module):
|
|
| 669 |
|
| 670 |
@dataclass
|
| 671 |
class CLIPVisionCfg:
|
| 672 |
-
|
| 673 |
-
|
|
|
|
| 674 |
head_width: int = 64
|
| 675 |
-
mlp_ratio: float =
|
| 676 |
-
patch_size: int =
|
| 677 |
-
image_size: Union[Tuple[int, int], int] =
|
| 678 |
ls_init_value: Optional[float] = None # layer scale initial value
|
| 679 |
patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
|
| 680 |
global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
|
| 681 |
-
drop_path_rate: Optional[float] =
|
| 682 |
timm_model_name: str = None # a valid model name overrides layers, width, patch_size
|
| 683 |
timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
|
| 684 |
timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
|
| 685 |
timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
|
| 686 |
timm_proj_bias: bool = False # enable bias final projection
|
| 687 |
-
eva_model_name: str =
|
| 688 |
qkv_bias: bool = True
|
| 689 |
fusedLN: bool = False
|
| 690 |
-
xattn: bool =
|
| 691 |
postnorm: bool = False
|
| 692 |
-
rope: bool =
|
| 693 |
pt_hw_seq_len: int = 16 # 224/14
|
| 694 |
-
intp_freq: bool =
|
| 695 |
-
naiveswiglu: bool =
|
| 696 |
-
subln: bool =
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
def build_vision_tower(
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
if isinstance(device, str):
|
| 705 |
-
device = torch.device(device)
|
| 706 |
-
|
| 707 |
-
model_cfg = json.load(open(model_name + '.json'))
|
| 708 |
-
if 'rope' in model_cfg.get('vision_cfg', {}):
|
| 709 |
-
if model_cfg['vision_cfg']['rope']:
|
| 710 |
os.environ['RoPE'] = "1"
|
| 711 |
else:
|
| 712 |
os.environ['RoPE'] = "0"
|
| 713 |
|
| 714 |
-
vision_cfg = CLIPVisionCfg(**model_cfg['vision_cfg'])
|
| 715 |
|
| 716 |
if vision_cfg.fusedLN:
|
| 717 |
-
|
| 718 |
-
from apex.normalization import FusedLayerNorm
|
| 719 |
-
except:
|
| 720 |
-
FusedLayerNorm = LayerNorm
|
| 721 |
-
print("Please 'pip install apex'")
|
| 722 |
norm_layer = partial(FusedLayerNorm, eps=1e-6)
|
| 723 |
else:
|
| 724 |
norm_layer = partial(LayerNorm, eps=1e-6)
|
|
@@ -726,7 +715,7 @@ def build_vision_tower(
|
|
| 726 |
vision_tower = EVAVisionTransformer(
|
| 727 |
img_size = vision_cfg.image_size,
|
| 728 |
patch_size = vision_cfg.patch_size,
|
| 729 |
-
num_classes =
|
| 730 |
use_mean_pooling = vision_cfg.global_average_pool,
|
| 731 |
init_values = vision_cfg.ls_init_value,
|
| 732 |
patch_dropout = vision_cfg.patch_dropout,
|
|
@@ -750,8 +739,6 @@ def build_vision_tower(
|
|
| 750 |
logging.info(f'convert precision to {precision}')
|
| 751 |
vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
|
| 752 |
|
| 753 |
-
vision_tower.to(device=device)
|
| 754 |
-
|
| 755 |
vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
|
| 756 |
vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
|
| 757 |
|
|
|
|
| 3 |
# --------------------------------------------------------
|
| 4 |
import math
|
| 5 |
import os
|
|
|
|
| 6 |
import logging
|
| 7 |
|
| 8 |
import torch
|
|
|
|
| 668 |
|
| 669 |
@dataclass
|
| 670 |
class CLIPVisionCfg:
|
| 671 |
+
embed_dim: int = 768
|
| 672 |
+
layers: Union[Tuple[int, int, int, int], int] = 24
|
| 673 |
+
width: int = 1024
|
| 674 |
head_width: int = 64
|
| 675 |
+
mlp_ratio: float = 2.6667
|
| 676 |
+
patch_size: int = 14
|
| 677 |
+
image_size: Union[Tuple[int, int], int] = 448
|
| 678 |
ls_init_value: Optional[float] = None # layer scale initial value
|
| 679 |
patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
|
| 680 |
global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
|
| 681 |
+
drop_path_rate: Optional[float] = 0 # drop path rate
|
| 682 |
timm_model_name: str = None # a valid model name overrides layers, width, patch_size
|
| 683 |
timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
|
| 684 |
timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
|
| 685 |
timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
|
| 686 |
timm_proj_bias: bool = False # enable bias final projection
|
| 687 |
+
eva_model_name: str = "eva-clip-l-14-448" # a valid eva model name overrides layers, width, patch_size
|
| 688 |
qkv_bias: bool = True
|
| 689 |
fusedLN: bool = False
|
| 690 |
+
xattn: bool = True
|
| 691 |
postnorm: bool = False
|
| 692 |
+
rope: bool = True
|
| 693 |
pt_hw_seq_len: int = 16 # 224/14
|
| 694 |
+
intp_freq: bool = True
|
| 695 |
+
naiveswiglu: bool = True
|
| 696 |
+
subln: bool = True
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
def build_vision_tower(precision: str = 'bf16'):
|
| 700 |
+
|
| 701 |
+
vision_cfg = CLIPVisionCfg()
|
| 702 |
+
|
| 703 |
+
if vision_cfg.rope:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
os.environ['RoPE'] = "1"
|
| 705 |
else:
|
| 706 |
os.environ['RoPE'] = "0"
|
| 707 |
|
|
|
|
| 708 |
|
| 709 |
if vision_cfg.fusedLN:
|
| 710 |
+
from apex.normalization import FusedLayerNorm
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
norm_layer = partial(FusedLayerNorm, eps=1e-6)
|
| 712 |
else:
|
| 713 |
norm_layer = partial(LayerNorm, eps=1e-6)
|
|
|
|
| 715 |
vision_tower = EVAVisionTransformer(
|
| 716 |
img_size = vision_cfg.image_size,
|
| 717 |
patch_size = vision_cfg.patch_size,
|
| 718 |
+
num_classes = vision_cfg.embed_dim,
|
| 719 |
use_mean_pooling = vision_cfg.global_average_pool,
|
| 720 |
init_values = vision_cfg.ls_init_value,
|
| 721 |
patch_dropout = vision_cfg.patch_dropout,
|
|
|
|
| 739 |
logging.info(f'convert precision to {precision}')
|
| 740 |
vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
|
| 741 |
|
|
|
|
|
|
|
| 742 |
vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
|
| 743 |
vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
|
| 744 |
|