To rename for future compatibility with transformers (#71)
- rename (d4027c075d6b9b3acb6a7fe61fb65950069034a4)
- config.json +2 -2
- image_processing.py → image_processing_paddleocr_vl.py +1 -1
- modeling_paddleocr_vl.py +31 -54
- preprocessor_config.json +2 -6
config.json
CHANGED
```diff
@@ -44,12 +44,12 @@
   "video_token_id": 101307,
   "vision_config": {
     "architectures": [
-      "SiglipVisionModel"
+      "PaddleOCRVisionModel"
     ],
     "attention_dropout": 0.0,
     "auto_map": {
      "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
-      "AutoModel": "modeling_paddleocr_vl.SiglipVisionModel"
+      "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
     },
     "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
```
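With these two entries renamed, the Auto classes resolve to `PaddleOCRVisionModel` when the checkpoint is loaded with `trust_remote_code=True`. A minimal sketch; the repo id `PaddlePaddle/PaddleOCR-VL` is an assumption (any local path to this checkpoint works the same way):

```python
from transformers import AutoConfig

repo = "PaddlePaddle/PaddleOCR-VL"  # assumed repo id; a local checkout also works

# trust_remote_code=True lets transformers import the classes named in auto_map
# (configuration_paddleocr_vl.PaddleOCRVLConfig and, for the vision tower,
# modeling_paddleocr_vl.PaddleOCRVisionModel) directly from the repo.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.vision_config.architectures)  # expected: ["PaddleOCRVisionModel"]
```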
image_processing.py → image_processing_paddleocr_vl.py
RENAMED
```diff
@@ -173,7 +173,7 @@ def smart_resize(
     return h_bar, w_bar
 
 
-class SiglipImageProcessor(BaseImageProcessor):
+class PaddleOCRVLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a Siglip image processor that dynamically resizes images based on the original images.
 
```
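The file now follows the `image_processing_<model>.py` naming convention that `transformers` uses for its own models, which is what the commit title means by future compatibility. A hedged sketch of importing the renamed class from a local checkout of the repo; it assumes the constructor has defaults for all arguments and the standard `BaseImageProcessor` call convention:

```python
# Run from inside a local checkout so the renamed module is importable.
from image_processing_paddleocr_vl import PaddleOCRVLImageProcessor
from PIL import Image

# Assumed: the class is constructible with its declared defaults.
processor = PaddleOCRVLImageProcessor()
inputs = processor(images=Image.new("RGB", (640, 480)), return_tensors="pt")
# Inspect whatever the processor returns (e.g. pixel_values and any grid info).
print({k: getattr(v, "shape", v) for k, v in inputs.items()})
```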
modeling_paddleocr_vl.py
CHANGED
````diff
@@ -1033,7 +1033,7 @@ class Projector(nn.Module):
         return hidden_states.view(*dims, -1)
 
 
-class SiglipVisionEmbeddings(nn.Module):
+class PaddleOCRVisionEmbeddings(nn.Module):
     def __init__(self, config: PaddleOCRVisionConfig):
         super().__init__()
         self.config = config
@@ -1217,7 +1217,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
-class SiglipAttention(nn.Module):
+class PaddleOCRAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config: PaddleOCRVisionConfig):
@@ -1348,8 +1348,8 @@ class SiglipAttention(nn.Module):
         return attn_output, attn_weights
 
 
-# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
-class SiglipMLP(nn.Module):
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->PaddleOCR
+class PaddleOCRMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -1364,14 +1364,14 @@ class SiglipMLP(nn.Module):
         return hidden_states
 
 
-class SiglipEncoderLayer(nn.Module):
+class PaddleOCREncoderLayer(nn.Module):
     def __init__(self, config: PaddleOCRVisionConfig):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
-        self.self_attn = SiglipAttention(config)
+        self.self_attn = PaddleOCRAttention(config)
         self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
-        self.mlp = SiglipMLP(config)
+        self.mlp = PaddleOCRMLP(config)
 
     def forward(
         self,
@@ -1416,23 +1416,23 @@ class SiglipEncoderLayer(nn.Module):
         return outputs
 
 
-class SiglipPreTrainedModel(PreTrainedModel):
+class PaddleOCRPreTrainedModel(PreTrainedModel):
     config_class = PaddleOCRVLConfig
-    base_model_prefix = "siglip"
+    base_model_prefix = "PaddleOCR"
     supports_gradient_checkpointing = True
 
     _no_split_modules = [
-        "SiglipTextEmbeddings",
-        "SiglipEncoderLayer",
-        "SiglipVisionEmbeddings",
-        "SiglipMultiheadAttentionPoolingHead",
+        "PaddleOCRTextEmbeddings",
+        "PaddleOCREncoderLayer",
+        "PaddleOCRVisionEmbeddings",
+        "PaddleOCRMultiheadAttentionPoolingHead",
     ]
     _supports_flash_attn_2 = True
     _supports_sdpa = True
 
     def _init_weights(self, module):
         """Initialize the weights"""
-        if isinstance(module, SiglipVisionEmbeddings):
+        if isinstance(module, PaddleOCRVisionEmbeddings):
             width = (
                 self.config.vision_config.hidden_size
                 if isinstance(self.config, PaddleOCRVLConfig)
@@ -1441,7 +1441,7 @@ class SiglipPreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
         elif isinstance(module, nn.Embedding):
             default_flax_embed_init(module.weight)
-        elif isinstance(module, SiglipAttention):
+        elif isinstance(module, PaddleOCRAttention):
             nn.init.xavier_uniform_(module.q_proj.weight)
             nn.init.xavier_uniform_(module.k_proj.weight)
             nn.init.xavier_uniform_(module.v_proj.weight)
@@ -1450,12 +1450,12 @@ class SiglipPreTrainedModel(PreTrainedModel):
             nn.init.zeros_(module.k_proj.bias)
             nn.init.zeros_(module.v_proj.bias)
             nn.init.zeros_(module.out_proj.bias)
-        elif isinstance(module, SiglipMLP):
+        elif isinstance(module, PaddleOCRMLP):
             nn.init.xavier_uniform_(module.fc1.weight)
             nn.init.xavier_uniform_(module.fc2.weight)
             nn.init.normal_(module.fc1.bias, std=1e-6)
             nn.init.normal_(module.fc2.bias, std=1e-6)
-        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
+        elif isinstance(module, PaddleOCRMultiheadAttentionPoolingHead):
             nn.init.xavier_uniform_(module.probe.data)
             nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
             nn.init.zeros_(module.attention.in_proj_bias.data)
@@ -1468,11 +1468,11 @@ class SiglipPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
-# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Siglip
-class SiglipEncoder(nn.Module):
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->PaddleOCR
+class PaddleOCREncoder(nn.Module):
     """
     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
-    [`SiglipEncoderLayer`].
+    [`PaddleOCREncoderLayer`].
 
     Args:
         config: PaddleOCRVLConfig
@@ -1485,7 +1485,7 @@ class SiglipEncoder(nn.Module):
         num_heads = config.num_attention_heads
         head_dim = embed_dim // num_heads
         self.layers = nn.ModuleList(
-            [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
+            [PaddleOCREncoderLayer(config) for _ in range(config.num_hidden_layers)]
         )
         self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
         self.gradient_checkpointing = False
@@ -1703,20 +1703,20 @@ class SiglipEncoder(nn.Module):
         )
 
 
-class SiglipVisionTransformer(nn.Module):
+class PaddleOCRVisionTransformer(nn.Module):
     def __init__(self, config: PaddleOCRVisionConfig):
         super().__init__()
         self.config = config
         embed_dim = config.hidden_size
 
-        self.embeddings = SiglipVisionEmbeddings(config)
-        self.encoder = SiglipEncoder(config)
+        self.embeddings = PaddleOCRVisionEmbeddings(config)
+        self.encoder = PaddleOCREncoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.use_head = (
             True if not hasattr(config, "vision_use_head") else config.vision_use_head
         )
         if self.use_head:
-            self.head = SiglipMultiheadAttentionPoolingHead(config)
+            self.head = PaddleOCRMultiheadAttentionPoolingHead(config)
 
     # @can_return_tuple
     def forward(
@@ -1861,7 +1861,7 @@ class SiglipVisionTransformer(nn.Module):
         )
 
 
-class SiglipMultiheadAttentionPoolingHead(nn.Module):
+class PaddleOCRMultiheadAttentionPoolingHead(nn.Module):
     """Multihead Attention Pooling."""
 
     def __init__(self, config: PaddleOCRVisionConfig):
@@ -1872,7 +1872,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
             config.hidden_size, config.num_attention_heads, batch_first=True
         )
         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.mlp = SiglipMLP(config)
+        self.mlp = PaddleOCRMLP(config)
 
     def forward(self, hidden_state, key_padding_mask=None):
         batch_size = hidden_state.shape[0]
@@ -1889,14 +1889,14 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
         return hidden_state[:, 0]
 
 
-class SiglipVisionModel(SiglipPreTrainedModel):
+class PaddleOCRVisionModel(PaddleOCRPreTrainedModel):
     config_class = PaddleOCRVisionConfig
     main_input_name = "pixel_values"
 
     def __init__(self, config: PaddleOCRVisionConfig):
         super().__init__(config)
 
-        self.vision_model = SiglipVisionTransformer(config)
+        self.vision_model = PaddleOCRVisionTransformer(config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1922,29 +1922,6 @@ class SiglipVisionModel(SiglipPreTrainedModel):
         use_rope: Optional[bool] = False,
         window_size: Optional[bool] = -1,
     ) -> BaseModelOutputWithPooling:
-        r"""
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import AutoProcessor, SiglipVisionModel
-
-        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
-        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, return_tensors="pt")
-
-        >>> outputs = model(**inputs)
-        >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output  # pooled features
-        ```"""
-
         return self.vision_model(
             pixel_values=pixel_values,
             output_attentions=output_attentions,
@@ -2055,12 +2032,12 @@ class PaddleOCRVLCausalLMOutputWithPast(ModelOutput):
 class PaddleOCRVLForConditionalGeneration(Ernie4_5PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     config_class = PaddleOCRVLConfig
-    _no_split_modules = ["Ernie4_5_DecoderLayer", "SiglipEncoderLayer"]
+    _no_split_modules = ["Ernie4_5_DecoderLayer", "PaddleOCREncoderLayer"]
 
     def __init__(self, config):
         super().__init__(config)
         self.mlp_AR = Projector(config, config.vision_config)
-        self.visual = SiglipVisionModel(config.vision_config)
+        self.visual = PaddleOCRVisionModel(config.vision_config)
         self.model = Ernie4_5Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
````
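Because checkpoint tensors are keyed by attribute paths (e.g. `visual.vision_model.encoder.layers.0.self_attn.q_proj.weight`), not by class names, a pure class rename leaves the serialized weights loadable as-is. A minimal self-contained sketch of that property, using toy modules rather than the real ones:

```python
import torch.nn as nn

class SiglipToy(nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(4, 4)

class PaddleOCRToy(nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(4, 4)

# state_dict keys come from attribute names, so renaming the class
# produces identical keys and old checkpoints need no remapping.
old_keys = set(SiglipToy().state_dict())
new_keys = set(PaddleOCRToy().state_dict())
assert old_keys == new_keys  # {'q_proj.weight', 'q_proj.bias'}

# Loading an "old" checkpoint into the "renamed" module works unchanged.
PaddleOCRToy().load_state_dict(SiglipToy().state_dict())
```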
preprocessor_config.json
CHANGED
```diff
@@ -1,6 +1,6 @@
 {
   "auto_map": {
-    "AutoImageProcessor": "image_processing.SiglipImageProcessor",
+    "AutoImageProcessor": "image_processing_paddleocr_vl.PaddleOCRVLImageProcessor",
     "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
   },
   "do_convert_rgb": true,
@@ -12,7 +12,7 @@
     0.5,
     0.5
   ],
-  "image_processor_type": "SiglipImageProcessor",
+  "image_processor_type": "PaddleOCRVLImageProcessor",
   "image_std": [
     0.5,
     0.5,
@@ -25,9 +25,5 @@
   "processor_class": "PaddleOCRVLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
-  "size": {
-    "max_pixels": 2822400,
-    "min_pixels": 147384
-  },
   "temporal_patch_size": 1
 }
```
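`AutoImageProcessor` now resolves to the renamed class via the updated `auto_map`; the dropped `size` block (`min_pixels`/`max_pixels`) presumably leaves those bounds to the processor's own defaults, since recent `transformers` releases validate the keys allowed in `size`. A minimal sketch, with the repo id again assumed:

```python
from PIL import Image
from transformers import AutoImageProcessor

repo = "PaddlePaddle/PaddleOCR-VL"  # assumed repo id

# "AutoImageProcessor": "image_processing_paddleocr_vl.PaddleOCRVLImageProcessor"
# in preprocessor_config.json tells transformers which remote-code class to load.
processor = AutoImageProcessor.from_pretrained(repo, trust_remote_code=True)
print(type(processor).__name__)  # expected: PaddleOCRVLImageProcessor

inputs = processor(images=Image.new("RGB", (640, 480)), return_tensors="pt")
print(inputs.keys())  # batch keys depend on the processor implementation
```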