Adi Raz Goldfarb [email protected] committed on
Commit
345c9a7
·
1 Parent(s): a6750a3

granite3.3

Browse files
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ibm-granite/granite-vision-3.3-2b",
3
+ "adapter_path": null,
4
+ "auto_map": {
5
+ "AutoModel": "modeling_colgranitevision.ColGraniteVision",
6
+ "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
7
+ "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
8
+ },
9
+ "architectures": [
10
+ "ColGraniteVision"
11
+ ],
12
+ "base_model": null,
13
+ "emb_dim_doc": 128,
14
+ "emb_dim_query": 128,
15
+ "image_grid_pinpoints": [
16
+ [
17
+ 384,
18
+ 768
19
+ ],
20
+ [
21
+ 384,
22
+ 1152
23
+ ],
24
+ [
25
+ 384,
26
+ 1536
27
+ ],
28
+ [
29
+ 384,
30
+ 1920
31
+ ],
32
+ [
33
+ 384,
34
+ 2304
35
+ ],
36
+ [
37
+ 384,
38
+ 2688
39
+ ],
40
+ [
41
+ 384,
42
+ 3072
43
+ ],
44
+ [
45
+ 384,
46
+ 3456
47
+ ],
48
+ [
49
+ 384,
50
+ 3840
51
+ ],
52
+ [
53
+ 768,
54
+ 384
55
+ ],
56
+ [
57
+ 768,
58
+ 768
59
+ ],
60
+ [
61
+ 768,
62
+ 1152
63
+ ],
64
+ [
65
+ 768,
66
+ 1536
67
+ ],
68
+ [
69
+ 768,
70
+ 1920
71
+ ],
72
+ [
73
+ 1152,
74
+ 384
75
+ ],
76
+ [
77
+ 1152,
78
+ 768
79
+ ],
80
+ [
81
+ 1152,
82
+ 1152
83
+ ],
84
+ [
85
+ 1536,
86
+ 384
87
+ ],
88
+ [
89
+ 1536,
90
+ 768
91
+ ],
92
+ [
93
+ 1920,
94
+ 384
95
+ ],
96
+ [
97
+ 1920,
98
+ 768
99
+ ],
100
+ [
101
+ 2304,
102
+ 384
103
+ ],
104
+ [
105
+ 2688,
106
+ 384
107
+ ],
108
+ [
109
+ 3072,
110
+ 384
111
+ ],
112
+ [
113
+ 3456,
114
+ 384
115
+ ],
116
+ [
117
+ 3840,
118
+ 384
119
+ ]
120
+ ],
121
+ "image_seq_length": 576,
122
+ "image_token_index": 49155,
123
+ "model_type": "colgranitevision",
124
+ "multimodal_projector_bias": true,
125
+ "pretrained_language_model": "",
126
+ "pretrained_vision_tower": "",
127
+ "projector_hidden_act": "gelu",
128
+ "text_config": {
129
+ "_attn_implementation_autoset": true,
130
+ "_name_or_path": "ibm-granite/granite-3.1-2b-instruct",
131
+ "architectures": [
132
+ "GraniteForCausalLM"
133
+ ],
134
+ "attention_dropout": 0.1,
135
+ "attention_multiplier": 0.015625,
136
+ "bos_token_id": 0,
137
+ "embedding_multiplier": 12.0,
138
+ "eos_token_id": 0,
139
+ "hidden_size": 2048,
140
+ "intermediate_size": 8192,
141
+ "logits_scaling": 8.0,
142
+ "max_position_embeddings": 131072,
143
+ "model_type": "granite",
144
+ "num_hidden_layers": 40,
145
+ "num_key_value_heads": 8,
146
+ "pad_token_id": 0,
147
+ "residual_multiplier": 0.22,
148
+ "rms_norm_eps": 1e-05,
149
+ "rope_theta": 300000,
150
+ "tie_word_embeddings": true,
151
+ "torch_dtype": "float32",
152
+ "vocab_size": 49156
153
+ },
154
+ "tie_word_embeddings": true,
155
+ "torch_dtype": "float32",
156
+ "transformers_version": "4.50.0.dev0",
157
+ "use_image_newline_parameter": true,
158
+ "vision_config": {
159
+ "_attn_implementation_autoset": true,
160
+ "hidden_act": "gelu_pytorch_tanh",
161
+ "hidden_size": 1152,
162
+ "image_size": 384,
163
+ "intermediate_size": 4304,
164
+ "layer_norm_eps": 1e-06,
165
+ "model_type": "siglip_vision_model",
166
+ "num_attention_heads": 16,
167
+ "num_hidden_layers": 27,
168
+ "patch_size": 14,
169
+ "torch_dtype": "float32"
170
+ },
171
+ "vision_feature_layer": [
172
+ -24,
173
+ -20,
174
+ -12,
175
+ -1
176
+ ],
177
+ "vision_feature_select_strategy": "full"
178
+ }
.ipynb_checkpoints/modeling_colgranitevision-checkpoint.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import ClassVar, Optional
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from transformers import LlavaNextPreTrainedModel
7
+ from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
8
+ from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
9
+
10
+ from .colgranitevision_config import ColGraniteVisionConfig
11
+
12
+
13
class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
    """LLaVA-NeXT model whose feature packing can place the base-image features first or last."""

    def pack_image_features(
        self,
        image_features,
        image_sizes,
        vision_feature_select_strategy,
        image_newline=None,
        base_image_feature_location="last",
    ):
        """
        Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.

        Args:
            image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
                List of image feature tensor, each contains all the visual feature of all patches.
            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
                Actual image size of each images (H, W).
            vision_feature_select_strategy (`str`)
                The feature selection strategy used to select the vision feature from the vision backbone.
            image_newline (`torch.Tensor` of shape `(embed_dim)`)
                New line embedding vector.
            base_image_feature_location (`str`, defaults to `"last"`)
                Where the first (base) patch of a multi-patch image goes relative to the remaining patches:
                `"last"` appends it after them, any other value puts it before them.
        Returns:
            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
            feature_lens (`torch.LongTensor` of shape `(num_images,)`)
                token length of each image in image_features
        """
        # Function-scope import so the module's top-level import block does not need to change.
        import warnings

        new_image_features = []
        feature_lens = []
        for image_idx, image_feature in enumerate(image_features):
            if image_feature.shape[0] > 1:
                # Patch 0 is kept aside as the base image; the remaining patches form the grid.
                base_image_feature = image_feature[0]
                image_feature = image_feature[1:]
                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size

                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
                    image_sizes[image_idx],
                    self.config.image_grid_pinpoints,
                    self.config.vision_config.image_size,
                )

                if (
                    np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
                    and vision_feature_select_strategy == "default"
                ):
                    # Library code should not write diagnostics to stdout; emit a warning instead.
                    warnings.warn(
                        "Image feature shape does not line up with the provided patch size. "
                        "You may be using the `default` vision_feature_select_strategy with a"
                        " visual encoder that does not have CLS.",
                        stacklevel=2,
                    )

                # (grid_h, grid_w, height, width, C) -> (C, grid_h, height, grid_w, width)
                # -> (C, grid_h * height, grid_w * width)
                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                # NOTE(review): presumably strips the padding added to reach the grid resolution -
                # confirm against transformers' `unpad_image`.
                image_feature = unpad_image(image_feature, image_sizes[image_idx])
                if image_newline is not None:
                    # Append one newline embedding at the end of every feature row.
                    image_feature = torch.cat(
                        (
                            image_feature,
                            image_newline[:, None, None]
                            .expand(*image_feature.shape[:-1], 1)
                            .to(image_feature.device, image_feature.dtype),
                        ),
                        dim=-1,
                    )
                # (C, rows, cols) -> (rows * cols, C)
                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                if base_image_feature_location == "last":
                    image_feature = torch.cat((image_feature, base_image_feature), dim=0)
                else:
                    image_feature = torch.cat((base_image_feature, image_feature), dim=0)

            else:
                # Single-patch image: use it as-is, followed by one newline embedding if provided.
                image_feature = image_feature[0]
                if image_newline is not None:
                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
            new_image_features.append(image_feature)
            feature_lens.append(image_feature.size(0))
        image_features = torch.cat(new_image_features, dim=0)
        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
        return image_features, feature_lens
93
+
94
+
95
class ColGraniteVision(LlavaNextPreTrainedModel):
    """
    ColGraniteVision model implementation.

    Wraps a `LlavaNextWithCustomPacking` backbone and projects its last hidden states to
    `self.dim`-dimensional, L2-normalized, attention-masked per-token embeddings.
    """

    main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
    config_class = ColGraniteVisionConfig

    def __init__(self, config: ColGraniteVisionConfig):
        """Build the backbone and the linear projection head from `config`."""
        super().__init__(config=config)

        model = LlavaNextWithCustomPacking(config=config)
        if model.language_model._tied_weights_keys is not None:
            # Re-prefix the language model's tied-weight keys so they match this wrapper's module paths.
            self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
        self.model = model

        # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
        # We could do it now but it would break all the models trying to load the model from the checkpoint.
        self.dim = 128
        self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)

        self.post_init()

    def forward(self, *args, **kwargs) -> torch.Tensor:
        """
        Compute per-token embeddings.

        `attention_mask` must be passed as a keyword argument; when `pixel_values` is passed,
        `input_ids` must be a keyword argument as well. Any caller-supplied `output_hidden_states`
        is discarded because hidden states are always requested from the backbone.

        Returns:
            `torch.Tensor` of shape `(batch_size, num_tokens, self.dim)`. For image inputs only the
            last `(image_size // patch_size) ** 2` image-token positions of each row are kept.
        """
        # Delete output_hidden_states from kwargs
        kwargs.pop("output_hidden_states", None)
        if "pixel_values" in kwargs:
            kwargs["pixel_values"] = kwargs["pixel_values"].to(dtype=self.dtype)

        outputs = self.model(*args, output_hidden_states=True, **kwargs)
        last_hidden_states = outputs.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)

        attention_mask = kwargs["attention_mask"]
        if "pixel_values" in kwargs:
            input_ids = kwargs["input_ids"]
            image_mask = input_ids == self.config.image_token_index

            # Number of positions to keep per row, derived from the vision config instead of a
            # hard-coded 729 (384 // 14 = 27 patches per side, 27 ** 2 = 729 for the shipped config).
            vision_config = self.config.vision_config
            patches_per_side = vision_config.image_size // vision_config.patch_size
            num_kept_tokens = patches_per_side**2

            N, M = image_mask.shape
            # Create an index matrix: each row is 0, 1, ..., M-1
            idx = torch.arange(M, device=image_mask.device).expand(N, M)
            # Replace False positions with -1 so they are ignored by topk (since all valid indices are >=0)
            masked_idx = torch.where(image_mask, idx, torch.tensor(-1, device=image_mask.device))
            # NOTE: every row must contain at least `num_kept_tokens` image tokens; otherwise the -1
            # placeholder is selected here and handed to `gather` below as an invalid index.
            topk_values, _ = torch.topk(masked_idx, k=num_kept_tokens, dim=1)
            # topk returns the largest indices in descending order; restore sequence order.
            last_k_indices, _ = torch.sort(topk_values, dim=1)
            last_k_indices_exp = last_k_indices.unsqueeze(-1).expand(-1, -1, last_hidden_states.size(-1))
            last_hidden_states = torch.gather(last_hidden_states, 1, last_k_indices_exp)
            attention_mask = torch.gather(attention_mask, 1, last_k_indices)

        attention_mask = attention_mask.unsqueeze(-1)

        proj = self.custom_text_proj(last_hidden_states)  # (batch_size, sequence_length, dim)

        # L2 normalization
        proj = proj / (proj.norm(dim=-1, keepdim=True) + 1e-8)

        # Zero out the embeddings of masked (padding) positions.
        proj = proj * attention_mask  # (batch_size, sequence_length, dim)

        return proj

    def get_input_embeddings(self):
        """Return the input embeddings of the wrapped language model."""
        return self.model.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Set the input embeddings of the wrapped language model."""
        self.model.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        """Return the output embeddings of the wrapped language model."""
        return self.model.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Set the output embeddings of the wrapped language model."""
        self.model.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        """Set the decoder of the wrapped language model."""
        self.model.language_model.set_decoder(decoder)

    def get_decoder(self):
        """Return the decoder of the wrapped language model."""
        return self.model.language_model.get_decoder()

    def tie_weights(self):
        """Delegate weight tying to the wrapped language model."""
        return self.model.language_model.tie_weights()

    def resize_token_embeddings(
        self,
        new_num_tokens: Optional[int] = None,
        pad_to_multiple_of=None,
    ) -> nn.Embedding:
        """
        Resize the language model's token embeddings and record the new vocabulary size on
        `config.text_config`, `config` and the backbone.

        Returns:
            The embedding module returned by the language model's `resize_token_embeddings`.
        """
        model_embeds = self.model.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

        # Update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.config.vocab_size = model_embeds.num_embeddings
        self.model.vocab_size = model_embeds.num_embeddings

        return model_embeds

    @property
    def patch_size(self) -> int:
        """Patch size read from the vision tower's config."""
        return self.model.vision_tower.config.patch_size
added_tokens.json CHANGED
@@ -1,6 +1,6 @@
1
- {
2
- "<image>": 49155,
3
- "<|end_of_role|>": 49153,
4
- "<|start_of_role|>": 49152,
5
- "<|tool_call|>": 49154
6
- }
 
1
+ {
2
+ "<image>": 49155,
3
+ "<|end_of_role|>": 49153,
4
+ "<|start_of_role|>": 49152,
5
+ "<|tool_call|>": 49154
6
+ }
config.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "_name_or_path": "ibm-granite/granite-vision-3.1-2b-preview",
3
- "_class_name": "ColGraniteVisionConfig",
4
  "auto_map": {
5
- "AutoModel": "modeling_colgranitevision.ColGraniteVision",
6
- "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
7
- "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
8
- },
9
  "architectures": [
10
  "ColGraniteVision"
11
  ],
@@ -122,29 +122,33 @@
122
  "image_token_index": 49155,
123
  "model_type": "colgranitevision",
124
  "multimodal_projector_bias": true,
 
 
125
  "projector_hidden_act": "gelu",
126
  "text_config": {
 
 
127
  "architectures": [
128
  "GraniteForCausalLM"
129
  ],
130
  "attention_dropout": 0.1,
131
  "attention_multiplier": 0.015625,
132
  "bos_token_id": 0,
133
- "embedding_multiplier": 12,
134
  "eos_token_id": 0,
135
  "hidden_size": 2048,
136
  "intermediate_size": 8192,
137
- "logits_scaling": 8,
138
- "max_position_embeddings": 16384,
139
  "model_type": "granite",
140
  "num_hidden_layers": 40,
141
  "num_key_value_heads": 8,
142
  "pad_token_id": 0,
143
  "residual_multiplier": 0.22,
144
- "rms_norm_eps": 0.00001,
145
  "rope_theta": 300000,
146
  "tie_word_embeddings": true,
147
- "torch_dtype": "bfloat16",
148
  "vocab_size": 49156
149
  },
150
  "tie_word_embeddings": true,
@@ -152,15 +156,17 @@
152
  "transformers_version": "4.50.0.dev0",
153
  "use_image_newline_parameter": true,
154
  "vision_config": {
 
155
  "hidden_act": "gelu_pytorch_tanh",
156
  "hidden_size": 1152,
157
  "image_size": 384,
158
  "intermediate_size": 4304,
159
- "layer_norm_eps": 0.000001,
160
  "model_type": "siglip_vision_model",
161
  "num_attention_heads": 16,
162
  "num_hidden_layers": 27,
163
- "patch_size": 14
 
164
  },
165
  "vision_feature_layer": [
166
  -24,
@@ -169,4 +175,4 @@
169
  -1
170
  ],
171
  "vision_feature_select_strategy": "full"
172
- }
 
1
  {
2
+ "_name_or_path": "ibm-granite/granite-vision-3.3-2b",
3
+ "adapter_path": null,
4
  "auto_map": {
5
+ "AutoModel": "modeling_colgranitevision.ColGraniteVision",
6
+ "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
7
+ "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
8
+ },
9
  "architectures": [
10
  "ColGraniteVision"
11
  ],
 
122
  "image_token_index": 49155,
123
  "model_type": "colgranitevision",
124
  "multimodal_projector_bias": true,
125
+ "pretrained_language_model": "",
126
+ "pretrained_vision_tower": "",
127
  "projector_hidden_act": "gelu",
128
  "text_config": {
129
+ "_attn_implementation_autoset": true,
130
+ "_name_or_path": "ibm-granite/granite-3.1-2b-instruct",
131
  "architectures": [
132
  "GraniteForCausalLM"
133
  ],
134
  "attention_dropout": 0.1,
135
  "attention_multiplier": 0.015625,
136
  "bos_token_id": 0,
137
+ "embedding_multiplier": 12.0,
138
  "eos_token_id": 0,
139
  "hidden_size": 2048,
140
  "intermediate_size": 8192,
141
+ "logits_scaling": 8.0,
142
+ "max_position_embeddings": 131072,
143
  "model_type": "granite",
144
  "num_hidden_layers": 40,
145
  "num_key_value_heads": 8,
146
  "pad_token_id": 0,
147
  "residual_multiplier": 0.22,
148
+ "rms_norm_eps": 1e-05,
149
  "rope_theta": 300000,
150
  "tie_word_embeddings": true,
151
+ "torch_dtype": "float32",
152
  "vocab_size": 49156
153
  },
154
  "tie_word_embeddings": true,
 
156
  "transformers_version": "4.50.0.dev0",
157
  "use_image_newline_parameter": true,
158
  "vision_config": {
159
+ "_attn_implementation_autoset": true,
160
  "hidden_act": "gelu_pytorch_tanh",
161
  "hidden_size": 1152,
162
  "image_size": 384,
163
  "intermediate_size": 4304,
164
+ "layer_norm_eps": 1e-06,
165
  "model_type": "siglip_vision_model",
166
  "num_attention_heads": 16,
167
  "num_hidden_layers": 27,
168
+ "patch_size": 14,
169
+ "torch_dtype": "float32"
170
  },
171
  "vision_feature_layer": [
172
  -24,
 
175
  -1
176
  ],
177
  "vision_feature_select_strategy": "full"
178
+ }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec8a694db663b30616ff06812d60256bb474c52051df2003faaec47c42b9a556
3
  size 4955415688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e838b6d98f48fbf45ae6c0d9c74cba649fd06b27ed78ced3971efbab7e16a69
3
  size 4955415688
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9df92d92a0d79465e4ee5eb57a51ee1630b159dc5833e26af9ca7bc9b3788d24
3
  size 4999979448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6bf1675fc15977b4d8f37ea1d4960ca2750e6793a80da9771e4693ae8cb13d6
3
  size 4999979448
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ebe0fe87507de69b86074af24513756fbcc205e83ccb2ee7bbe9238a751f29
3
  size 1947355456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15978cba0606360676faad5c3cf486a58e6d78a1352dbfcd1db51a7410a574d5
3
  size 1947355456
preprocessor_config.json CHANGED
@@ -1,136 +1,137 @@
1
- {
2
- "crop_size": {
3
- "height": 384,
4
- "width": 384
5
- },
6
- "do_center_crop": true,
7
- "do_convert_rgb": null,
8
- "do_normalize": true,
9
- "do_pad": true,
10
- "do_rescale": true,
11
- "do_resize": true,
12
- "image_grid_pinpoints": [
13
- [
14
- 384,
15
- 768
16
- ],
17
- [
18
- 384,
19
- 1152
20
- ],
21
- [
22
- 384,
23
- 1536
24
- ],
25
- [
26
- 384,
27
- 1920
28
- ],
29
- [
30
- 384,
31
- 2304
32
- ],
33
- [
34
- 384,
35
- 2688
36
- ],
37
- [
38
- 384,
39
- 3072
40
- ],
41
- [
42
- 384,
43
- 3456
44
- ],
45
- [
46
- 384,
47
- 3840
48
- ],
49
- [
50
- 768,
51
- 384
52
- ],
53
- [
54
- 768,
55
- 768
56
- ],
57
- [
58
- 768,
59
- 1152
60
- ],
61
- [
62
- 768,
63
- 1536
64
- ],
65
- [
66
- 768,
67
- 1920
68
- ],
69
- [
70
- 1152,
71
- 384
72
- ],
73
- [
74
- 1152,
75
- 768
76
- ],
77
- [
78
- 1152,
79
- 1152
80
- ],
81
- [
82
- 1536,
83
- 384
84
- ],
85
- [
86
- 1536,
87
- 768
88
- ],
89
- [
90
- 1920,
91
- 384
92
- ],
93
- [
94
- 1920,
95
- 768
96
- ],
97
- [
98
- 2304,
99
- 384
100
- ],
101
- [
102
- 2688,
103
- 384
104
- ],
105
- [
106
- 3072,
107
- 384
108
- ],
109
- [
110
- 3456,
111
- 384
112
- ],
113
- [
114
- 3840,
115
- 384
116
- ]
117
- ],
118
- "image_mean": [
119
- 0.5,
120
- 0.5,
121
- 0.5
122
- ],
123
- "image_processor_type": "LlavaNextImageProcessor",
124
- "image_std": [
125
- 0.5,
126
- 0.5,
127
- 0.5
128
- ],
129
- "processor_class": "ColGraniteVisionProcessor",
130
- "resample": 3,
131
- "rescale_factor": 0.00392156862745098,
132
- "size": {
133
- "height": 384,
134
- "width": 384
135
- }
136
- }
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 384,
4
+ "width": 384
5
+ },
6
+ "default_to_square": false,
7
+ "do_center_crop": true,
8
+ "do_convert_rgb": null,
9
+ "do_normalize": true,
10
+ "do_pad": true,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_grid_pinpoints": [
14
+ [
15
+ 384,
16
+ 768
17
+ ],
18
+ [
19
+ 384,
20
+ 1152
21
+ ],
22
+ [
23
+ 384,
24
+ 1536
25
+ ],
26
+ [
27
+ 384,
28
+ 1920
29
+ ],
30
+ [
31
+ 384,
32
+ 2304
33
+ ],
34
+ [
35
+ 384,
36
+ 2688
37
+ ],
38
+ [
39
+ 384,
40
+ 3072
41
+ ],
42
+ [
43
+ 384,
44
+ 3456
45
+ ],
46
+ [
47
+ 384,
48
+ 3840
49
+ ],
50
+ [
51
+ 768,
52
+ 384
53
+ ],
54
+ [
55
+ 768,
56
+ 768
57
+ ],
58
+ [
59
+ 768,
60
+ 1152
61
+ ],
62
+ [
63
+ 768,
64
+ 1536
65
+ ],
66
+ [
67
+ 768,
68
+ 1920
69
+ ],
70
+ [
71
+ 1152,
72
+ 384
73
+ ],
74
+ [
75
+ 1152,
76
+ 768
77
+ ],
78
+ [
79
+ 1152,
80
+ 1152
81
+ ],
82
+ [
83
+ 1536,
84
+ 384
85
+ ],
86
+ [
87
+ 1536,
88
+ 768
89
+ ],
90
+ [
91
+ 1920,
92
+ 384
93
+ ],
94
+ [
95
+ 1920,
96
+ 768
97
+ ],
98
+ [
99
+ 2304,
100
+ 384
101
+ ],
102
+ [
103
+ 2688,
104
+ 384
105
+ ],
106
+ [
107
+ 3072,
108
+ 384
109
+ ],
110
+ [
111
+ 3456,
112
+ 384
113
+ ],
114
+ [
115
+ 3840,
116
+ 384
117
+ ]
118
+ ],
119
+ "image_mean": [
120
+ 0.5,
121
+ 0.5,
122
+ 0.5
123
+ ],
124
+ "image_processor_type": "LlavaNextImageProcessor",
125
+ "image_std": [
126
+ 0.5,
127
+ 0.5,
128
+ 0.5
129
+ ],
130
+ "processor_class": "ColGraniteVisionProcessor",
131
+ "resample": 3,
132
+ "rescale_factor": 0.00392156862745098,
133
+ "size": {
134
+ "height": 384,
135
+ "width": 384
136
+ }
137
+ }
special_tokens_map.json CHANGED
@@ -1,35 +1,35 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|start_of_role|>",
4
- "<|end_of_role|>",
5
- "<|tool_call|>"
6
- ],
7
- "bos_token": {
8
- "content": "<|end_of_text|>",
9
- "lstrip": false,
10
- "normalized": false,
11
- "rstrip": false,
12
- "single_word": false
13
- },
14
- "eos_token": {
15
- "content": "<|end_of_text|>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "pad_token": {
22
- "content": "<|end_of_text|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false
27
- },
28
- "unk_token": {
29
- "content": "<|end_of_text|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false
34
- }
35
- }
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|start_of_role|>",
4
+ "<|end_of_role|>",
5
+ "<|tool_call|>"
6
+ ],
7
+ "bos_token": {
8
+ "content": "<|end_of_text|>",
9
+ "lstrip": false,
10
+ "normalized": false,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "eos_token": {
15
+ "content": "<|end_of_text|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "pad_token": {
22
+ "content": "<|end_of_text|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "unk_token": {
29
+ "content": "<|end_of_text|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }
tokenizer_config.json CHANGED
@@ -1,207 +1,208 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<|end_of_text|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<fim_prefix>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "<fim_middle>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "3": {
30
- "content": "<fim_suffix>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "4": {
38
- "content": "<fim_pad>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "5": {
46
- "content": "<filename>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "6": {
54
- "content": "<gh_stars>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "7": {
62
- "content": "<issue_start>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "8": {
70
- "content": "<issue_comment>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "9": {
78
- "content": "<issue_closed>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "10": {
86
- "content": "<jupyter_start>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "11": {
94
- "content": "<jupyter_text>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "12": {
102
- "content": "<jupyter_code>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "13": {
110
- "content": "<jupyter_output>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "14": {
118
- "content": "<empty_output>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": true
124
- },
125
- "15": {
126
- "content": "<commit_before>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": true
132
- },
133
- "16": {
134
- "content": "<commit_msg>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": true
140
- },
141
- "17": {
142
- "content": "<commit_after>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": true
148
- },
149
- "18": {
150
- "content": "<reponame>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": true
156
- },
157
- "49152": {
158
- "content": "<|start_of_role|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": true
164
- },
165
- "49153": {
166
- "content": "<|end_of_role|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": true
172
- },
173
- "49154": {
174
- "content": "<|tool_call|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": true
180
- },
181
- "49155": {
182
- "content": "<image>",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": true
188
- }
189
- },
190
- "additional_special_tokens": [
191
- "<|start_of_role|>",
192
- "<|end_of_role|>",
193
- "<|tool_call|>"
194
- ],
195
- "bos_token": "<|end_of_text|>",
196
- "chat_template": "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages if message['role'] == 'system'%}{% else %}<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n{% endfor %}{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|system|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|user|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|assistant|>\n' + message['content'] + '<|end_of_text|>' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|assistant|>\n' }}\n {%- endif %}\n{%- endfor %}",
197
- "clean_up_tokenization_spaces": true,
198
- "eos_token": "<|end_of_text|>",
199
- "errors": "replace",
200
- "extra_special_tokens": {},
201
- "model_max_length": 16384,
202
- "pad_token": "<|end_of_text|>",
203
- "padding_side": "right",
204
- "tokenizer_class": "GPT2Tokenizer",
205
- "unk_token": "<|end_of_text|>",
206
- "vocab_size": 49152
 
207
  }
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<|end_of_text|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<fim_prefix>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<fim_middle>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<fim_suffix>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<fim_pad>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<filename>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<gh_stars>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<issue_start>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "<issue_comment>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "<issue_closed>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "10": {
86
+ "content": "<jupyter_start>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "11": {
94
+ "content": "<jupyter_text>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "12": {
102
+ "content": "<jupyter_code>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "13": {
110
+ "content": "<jupyter_output>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "14": {
118
+ "content": "<empty_output>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "15": {
126
+ "content": "<commit_before>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "16": {
134
+ "content": "<commit_msg>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "17": {
142
+ "content": "<commit_after>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "18": {
150
+ "content": "<reponame>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "49152": {
158
+ "content": "<|start_of_role|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "49153": {
166
+ "content": "<|end_of_role|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "49154": {
174
+ "content": "<|tool_call|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "49155": {
182
+ "content": "<image>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ }
189
+ },
190
+ "additional_special_tokens": [
191
+ "<|start_of_role|>",
192
+ "<|end_of_role|>",
193
+ "<|tool_call|>"
194
+ ],
195
+ "bos_token": "<|end_of_text|>",
196
+ "chat_template": "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages if message['role'] == 'system'%}{% else %}<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n{% endfor %}{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|system|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|user|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|assistant|>\n' + message['content'] + '<|end_of_text|>' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|assistant|>\n' }}\n {%- endif %}\n{%- endfor %}",
197
+ "clean_up_tokenization_spaces": true,
198
+ "do_image_splitting": false,
199
+ "eos_token": "<|end_of_text|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|end_of_text|>",
204
+ "padding_side": "right",
205
+ "tokenizer_class": "GPT2Tokenizer",
206
+ "unk_token": "<|end_of_text|>",
207
+ "vocab_size": 49152
208
  }