xiaohei66 committed on
Commit
a877c8c
·
verified ·
1 Parent(s): 466ad13

Rename classes and files for future compatibility with transformers (#71)

Browse files

- rename (d4027c075d6b9b3acb6a7fe61fb65950069034a4)

config.json CHANGED
@@ -44,12 +44,12 @@
44
  "video_token_id": 101307,
45
  "vision_config": {
46
  "architectures": [
47
- "SiglipVisionModel"
48
  ],
49
  "attention_dropout": 0.0,
50
  "auto_map": {
51
  "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
52
- "AutoModel": "modeling_paddleocr_vl.SiglipVisionModel"
53
  },
54
  "hidden_act": "gelu_pytorch_tanh",
55
  "hidden_size": 1152,
 
44
  "video_token_id": 101307,
45
  "vision_config": {
46
  "architectures": [
47
+ "PaddleOCRVisionModel"
48
  ],
49
  "attention_dropout": 0.0,
50
  "auto_map": {
51
  "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
52
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
53
  },
54
  "hidden_act": "gelu_pytorch_tanh",
55
  "hidden_size": 1152,
image_processing.py → image_processing_paddleocr_vl.py RENAMED
@@ -173,7 +173,7 @@ def smart_resize(
173
  return h_bar, w_bar
174
 
175
 
176
- class SiglipImageProcessor(BaseImageProcessor):
177
  r"""
178
  Constructs a Siglip image processor that dynamically resizes images based on the original images.
179
 
 
173
  return h_bar, w_bar
174
 
175
 
176
+ class PaddleOCRVLImageProcessor(BaseImageProcessor):
177
  r"""
178
  Constructs a Siglip image processor that dynamically resizes images based on the original images.
179
 
modeling_paddleocr_vl.py CHANGED
@@ -1033,7 +1033,7 @@ class Projector(nn.Module):
1033
  return hidden_states.view(*dims, -1)
1034
 
1035
 
1036
- class SiglipVisionEmbeddings(nn.Module):
1037
  def __init__(self, config: PaddleOCRVisionConfig):
1038
  super().__init__()
1039
  self.config = config
@@ -1217,7 +1217,7 @@ def eager_attention_forward(
1217
  return attn_output, attn_weights
1218
 
1219
 
1220
- class SiglipAttention(nn.Module):
1221
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1222
 
1223
  def __init__(self, config: PaddleOCRVisionConfig):
@@ -1348,8 +1348,8 @@ class SiglipAttention(nn.Module):
1348
  return attn_output, attn_weights
1349
 
1350
 
1351
- # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
1352
- class SiglipMLP(nn.Module):
1353
  def __init__(self, config):
1354
  super().__init__()
1355
  self.config = config
@@ -1364,14 +1364,14 @@ class SiglipMLP(nn.Module):
1364
  return hidden_states
1365
 
1366
 
1367
- class SiglipEncoderLayer(nn.Module):
1368
  def __init__(self, config: PaddleOCRVisionConfig):
1369
  super().__init__()
1370
  self.embed_dim = config.hidden_size
1371
  self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
1372
- self.self_attn = SiglipAttention(config)
1373
  self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
1374
- self.mlp = SiglipMLP(config)
1375
 
1376
  def forward(
1377
  self,
@@ -1416,23 +1416,23 @@ class SiglipEncoderLayer(nn.Module):
1416
  return outputs
1417
 
1418
 
1419
- class SiglipPreTrainedModel(PreTrainedModel):
1420
  config_class = PaddleOCRVLConfig
1421
- base_model_prefix = "siglip"
1422
  supports_gradient_checkpointing = True
1423
 
1424
  _no_split_modules = [
1425
- "SiglipTextEmbeddings",
1426
- "SiglipEncoderLayer",
1427
- "SiglipVisionEmbeddings",
1428
- "SiglipMultiheadAttentionPoolingHead",
1429
  ]
1430
  _supports_flash_attn_2 = True
1431
  _supports_sdpa = True
1432
 
1433
  def _init_weights(self, module):
1434
  """Initialize the weights"""
1435
- if isinstance(module, SiglipVisionEmbeddings):
1436
  width = (
1437
  self.config.vision_config.hidden_size
1438
  if isinstance(self.config, PaddleOCRVLConfig)
@@ -1441,7 +1441,7 @@ class SiglipPreTrainedModel(PreTrainedModel):
1441
  nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
1442
  elif isinstance(module, nn.Embedding):
1443
  default_flax_embed_init(module.weight)
1444
- elif isinstance(module, SiglipAttention):
1445
  nn.init.xavier_uniform_(module.q_proj.weight)
1446
  nn.init.xavier_uniform_(module.k_proj.weight)
1447
  nn.init.xavier_uniform_(module.v_proj.weight)
@@ -1450,12 +1450,12 @@ class SiglipPreTrainedModel(PreTrainedModel):
1450
  nn.init.zeros_(module.k_proj.bias)
1451
  nn.init.zeros_(module.v_proj.bias)
1452
  nn.init.zeros_(module.out_proj.bias)
1453
- elif isinstance(module, SiglipMLP):
1454
  nn.init.xavier_uniform_(module.fc1.weight)
1455
  nn.init.xavier_uniform_(module.fc2.weight)
1456
  nn.init.normal_(module.fc1.bias, std=1e-6)
1457
  nn.init.normal_(module.fc2.bias, std=1e-6)
1458
- elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
1459
  nn.init.xavier_uniform_(module.probe.data)
1460
  nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
1461
  nn.init.zeros_(module.attention.in_proj_bias.data)
@@ -1468,11 +1468,11 @@ class SiglipPreTrainedModel(PreTrainedModel):
1468
  module.weight.data.fill_(1.0)
1469
 
1470
 
1471
- # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Siglip
1472
- class SiglipEncoder(nn.Module):
1473
  """
1474
  Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
1475
- [`SiglipEncoderLayer`].
1476
 
1477
  Args:
1478
  config: PaddleOCRVLConfig
@@ -1485,7 +1485,7 @@ class SiglipEncoder(nn.Module):
1485
  num_heads = config.num_attention_heads
1486
  head_dim = embed_dim // num_heads
1487
  self.layers = nn.ModuleList(
1488
- [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
1489
  )
1490
  self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
1491
  self.gradient_checkpointing = False
@@ -1703,20 +1703,20 @@ class SiglipEncoder(nn.Module):
1703
  )
1704
 
1705
 
1706
- class SiglipVisionTransformer(nn.Module):
1707
  def __init__(self, config: PaddleOCRVisionConfig):
1708
  super().__init__()
1709
  self.config = config
1710
  embed_dim = config.hidden_size
1711
 
1712
- self.embeddings = SiglipVisionEmbeddings(config)
1713
- self.encoder = SiglipEncoder(config)
1714
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1715
  self.use_head = (
1716
  True if not hasattr(config, "vision_use_head") else config.vision_use_head
1717
  )
1718
  if self.use_head:
1719
- self.head = SiglipMultiheadAttentionPoolingHead(config)
1720
 
1721
  # @can_return_tuple
1722
  def forward(
@@ -1861,7 +1861,7 @@ class SiglipVisionTransformer(nn.Module):
1861
  )
1862
 
1863
 
1864
- class SiglipMultiheadAttentionPoolingHead(nn.Module):
1865
  """Multihead Attention Pooling."""
1866
 
1867
  def __init__(self, config: PaddleOCRVisionConfig):
@@ -1872,7 +1872,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
1872
  config.hidden_size, config.num_attention_heads, batch_first=True
1873
  )
1874
  self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1875
- self.mlp = SiglipMLP(config)
1876
 
1877
  def forward(self, hidden_state, key_padding_mask=None):
1878
  batch_size = hidden_state.shape[0]
@@ -1889,14 +1889,14 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
1889
  return hidden_state[:, 0]
1890
 
1891
 
1892
- class SiglipVisionModel(SiglipPreTrainedModel):
1893
  config_class = PaddleOCRVisionConfig
1894
  main_input_name = "pixel_values"
1895
 
1896
  def __init__(self, config: PaddleOCRVisionConfig):
1897
  super().__init__(config)
1898
 
1899
- self.vision_model = SiglipVisionTransformer(config)
1900
 
1901
  # Initialize weights and apply final processing
1902
  self.post_init()
@@ -1922,29 +1922,6 @@ class SiglipVisionModel(SiglipPreTrainedModel):
1922
  use_rope: Optional[bool] = False,
1923
  window_size: Optional[bool] = -1,
1924
  ) -> BaseModelOutputWithPooling:
1925
- r"""
1926
- Returns:
1927
-
1928
- Examples:
1929
-
1930
- ```python
1931
- >>> from PIL import Image
1932
- >>> import requests
1933
- >>> from transformers import AutoProcessor, SiglipVisionModel
1934
-
1935
- >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
1936
- >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1937
-
1938
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1939
- >>> image = Image.open(requests.get(url, stream=True).raw)
1940
-
1941
- >>> inputs = processor(images=image, return_tensors="pt")
1942
-
1943
- >>> outputs = model(**inputs)
1944
- >>> last_hidden_state = outputs.last_hidden_state
1945
- >>> pooled_output = outputs.pooler_output # pooled features
1946
- ```"""
1947
-
1948
  return self.vision_model(
1949
  pixel_values=pixel_values,
1950
  output_attentions=output_attentions,
@@ -2055,12 +2032,12 @@ class PaddleOCRVLCausalLMOutputWithPast(ModelOutput):
2055
  class PaddleOCRVLForConditionalGeneration(Ernie4_5PreTrainedModel, GenerationMixin):
2056
  _tied_weights_keys = ["lm_head.weight"]
2057
  config_class = PaddleOCRVLConfig
2058
- _no_split_modules = ["Ernie4_5_DecoderLayer", "SiglipEncoderLayer"]
2059
 
2060
  def __init__(self, config):
2061
  super().__init__(config)
2062
  self.mlp_AR = Projector(config, config.vision_config)
2063
- self.visual = SiglipVisionModel(config.vision_config)
2064
  self.model = Ernie4_5Model(config)
2065
  self.vocab_size = config.vocab_size
2066
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
1033
  return hidden_states.view(*dims, -1)
1034
 
1035
 
1036
+ class PaddleOCRVisionEmbeddings(nn.Module):
1037
  def __init__(self, config: PaddleOCRVisionConfig):
1038
  super().__init__()
1039
  self.config = config
 
1217
  return attn_output, attn_weights
1218
 
1219
 
1220
+ class PaddleOCRAttention(nn.Module):
1221
  """Multi-headed attention from 'Attention Is All You Need' paper"""
1222
 
1223
  def __init__(self, config: PaddleOCRVisionConfig):
 
1348
  return attn_output, attn_weights
1349
 
1350
 
1351
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->PaddleOCR
1352
+ class PaddleOCRMLP(nn.Module):
1353
  def __init__(self, config):
1354
  super().__init__()
1355
  self.config = config
 
1364
  return hidden_states
1365
 
1366
 
1367
+ class PaddleOCREncoderLayer(nn.Module):
1368
  def __init__(self, config: PaddleOCRVisionConfig):
1369
  super().__init__()
1370
  self.embed_dim = config.hidden_size
1371
  self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
1372
+ self.self_attn = PaddleOCRAttention(config)
1373
  self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
1374
+ self.mlp = PaddleOCRMLP(config)
1375
 
1376
  def forward(
1377
  self,
 
1416
  return outputs
1417
 
1418
 
1419
+ class PaddleOCRPreTrainedModel(PreTrainedModel):
1420
  config_class = PaddleOCRVLConfig
1421
+ base_model_prefix = "PaddleOCR"
1422
  supports_gradient_checkpointing = True
1423
 
1424
  _no_split_modules = [
1425
+ "PaddleOCRTextEmbeddings",
1426
+ "PaddleOCREncoderLayer",
1427
+ "PaddleOCRVisionEmbeddings",
1428
+ "PaddleOCRMultiheadAttentionPoolingHead",
1429
  ]
1430
  _supports_flash_attn_2 = True
1431
  _supports_sdpa = True
1432
 
1433
  def _init_weights(self, module):
1434
  """Initialize the weights"""
1435
+ if isinstance(module, PaddleOCRVisionEmbeddings):
1436
  width = (
1437
  self.config.vision_config.hidden_size
1438
  if isinstance(self.config, PaddleOCRVLConfig)
 
1441
  nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
1442
  elif isinstance(module, nn.Embedding):
1443
  default_flax_embed_init(module.weight)
1444
+ elif isinstance(module, PaddleOCRAttention):
1445
  nn.init.xavier_uniform_(module.q_proj.weight)
1446
  nn.init.xavier_uniform_(module.k_proj.weight)
1447
  nn.init.xavier_uniform_(module.v_proj.weight)
 
1450
  nn.init.zeros_(module.k_proj.bias)
1451
  nn.init.zeros_(module.v_proj.bias)
1452
  nn.init.zeros_(module.out_proj.bias)
1453
+ elif isinstance(module, PaddleOCRMLP):
1454
  nn.init.xavier_uniform_(module.fc1.weight)
1455
  nn.init.xavier_uniform_(module.fc2.weight)
1456
  nn.init.normal_(module.fc1.bias, std=1e-6)
1457
  nn.init.normal_(module.fc2.bias, std=1e-6)
1458
+ elif isinstance(module, PaddleOCRMultiheadAttentionPoolingHead):
1459
  nn.init.xavier_uniform_(module.probe.data)
1460
  nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
1461
  nn.init.zeros_(module.attention.in_proj_bias.data)
 
1468
  module.weight.data.fill_(1.0)
1469
 
1470
 
1471
+ # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->PaddleOCR
1472
+ class PaddleOCREncoder(nn.Module):
1473
  """
1474
  Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
1475
+ [`PaddleOCREncoderLayer`].
1476
 
1477
  Args:
1478
  config: PaddleOCRVLConfig
 
1485
  num_heads = config.num_attention_heads
1486
  head_dim = embed_dim // num_heads
1487
  self.layers = nn.ModuleList(
1488
+ [PaddleOCREncoderLayer(config) for _ in range(config.num_hidden_layers)]
1489
  )
1490
  self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
1491
  self.gradient_checkpointing = False
 
1703
  )
1704
 
1705
 
1706
+ class PaddleOCRVisionTransformer(nn.Module):
1707
  def __init__(self, config: PaddleOCRVisionConfig):
1708
  super().__init__()
1709
  self.config = config
1710
  embed_dim = config.hidden_size
1711
 
1712
+ self.embeddings = PaddleOCRVisionEmbeddings(config)
1713
+ self.encoder = PaddleOCREncoder(config)
1714
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1715
  self.use_head = (
1716
  True if not hasattr(config, "vision_use_head") else config.vision_use_head
1717
  )
1718
  if self.use_head:
1719
+ self.head = PaddleOCRMultiheadAttentionPoolingHead(config)
1720
 
1721
  # @can_return_tuple
1722
  def forward(
 
1861
  )
1862
 
1863
 
1864
+ class PaddleOCRMultiheadAttentionPoolingHead(nn.Module):
1865
  """Multihead Attention Pooling."""
1866
 
1867
  def __init__(self, config: PaddleOCRVisionConfig):
 
1872
  config.hidden_size, config.num_attention_heads, batch_first=True
1873
  )
1874
  self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1875
+ self.mlp = PaddleOCRMLP(config)
1876
 
1877
  def forward(self, hidden_state, key_padding_mask=None):
1878
  batch_size = hidden_state.shape[0]
 
1889
  return hidden_state[:, 0]
1890
 
1891
 
1892
+ class PaddleOCRVisionModel(PaddleOCRPreTrainedModel):
1893
  config_class = PaddleOCRVisionConfig
1894
  main_input_name = "pixel_values"
1895
 
1896
  def __init__(self, config: PaddleOCRVisionConfig):
1897
  super().__init__(config)
1898
 
1899
+ self.vision_model = PaddleOCRVisionTransformer(config)
1900
 
1901
  # Initialize weights and apply final processing
1902
  self.post_init()
 
1922
  use_rope: Optional[bool] = False,
1923
  window_size: Optional[bool] = -1,
1924
  ) -> BaseModelOutputWithPooling:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1925
  return self.vision_model(
1926
  pixel_values=pixel_values,
1927
  output_attentions=output_attentions,
 
2032
  class PaddleOCRVLForConditionalGeneration(Ernie4_5PreTrainedModel, GenerationMixin):
2033
  _tied_weights_keys = ["lm_head.weight"]
2034
  config_class = PaddleOCRVLConfig
2035
+ _no_split_modules = ["Ernie4_5_DecoderLayer", "PaddleOCREncoderLayer"]
2036
 
2037
  def __init__(self, config):
2038
  super().__init__(config)
2039
  self.mlp_AR = Projector(config, config.vision_config)
2040
+ self.visual = PaddleOCRVisionModel(config.vision_config)
2041
  self.model = Ernie4_5Model(config)
2042
  self.vocab_size = config.vocab_size
2043
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
preprocessor_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_map": {
3
- "AutoImageProcessor": "image_processing.SiglipImageProcessor",
4
  "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
5
  },
6
  "do_convert_rgb": true,
@@ -12,7 +12,7 @@
12
  0.5,
13
  0.5
14
  ],
15
- "image_processor_type": "SiglipImageProcessor",
16
  "image_std": [
17
  0.5,
18
  0.5,
@@ -25,9 +25,5 @@
25
  "processor_class": "PaddleOCRVLProcessor",
26
  "resample": 3,
27
  "rescale_factor": 0.00392156862745098,
28
- "size": {
29
- "max_pixels": 2822400,
30
- "min_pixels": 147384
31
- },
32
  "temporal_patch_size": 1
33
  }
 
1
  {
2
  "auto_map": {
3
+ "AutoImageProcessor": "image_processing_paddleocr_vl.PaddleOCRVLImageProcessor",
4
  "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
5
  },
6
  "do_convert_rgb": true,
 
12
  0.5,
13
  0.5
14
  ],
15
+ "image_processor_type": "PaddleOCRVLImageProcessor",
16
  "image_std": [
17
  0.5,
18
  0.5,
 
25
  "processor_class": "PaddleOCRVLProcessor",
26
  "resample": 3,
27
  "rescale_factor": 0.00392156862745098,
 
 
 
 
28
  "temporal_patch_size": 1
29
  }