fantos Tingquan commited on
Commit
b1902a6
·
verified ·
0 Parent(s):

Duplicate from PaddlePaddle/PaddleOCR-VL

Browse files

Co-authored-by: Tingquan Gao <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ PP-DocLayoutV2/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
37
+ PP-DocLayoutV2/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
PP-DocLayoutV2/config.json ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "paddle",
3
+ "draw_threshold": 0.5,
4
+ "metric": "COCO",
5
+ "use_dynamic_shape": false,
6
+ "Global": {
7
+ "model_name": "PP-DocLayoutV2"
8
+ },
9
+ "arch": "DETR",
10
+ "min_subgraph_size": 3,
11
+ "Preprocess": [
12
+ {
13
+ "interp": 2,
14
+ "keep_ratio": false,
15
+ "target_size": [
16
+ 800,
17
+ 800
18
+ ],
19
+ "type": "Resize"
20
+ },
21
+ {
22
+ "mean": [
23
+ 0.0,
24
+ 0.0,
25
+ 0.0
26
+ ],
27
+ "norm_type": "none",
28
+ "std": [
29
+ 1.0,
30
+ 1.0,
31
+ 1.0
32
+ ],
33
+ "type": "NormalizeImage"
34
+ },
35
+ {
36
+ "type": "Permute"
37
+ }
38
+ ],
39
+ "label_list": [
40
+ "abstract",
41
+ "algorithm",
42
+ "aside_text",
43
+ "chart",
44
+ "content",
45
+ "display_formula",
46
+ "doc_title",
47
+ "figure_title",
48
+ "footer",
49
+ "footer_image",
50
+ "footnote",
51
+ "formula_number",
52
+ "header",
53
+ "header_image",
54
+ "image",
55
+ "inline_formula",
56
+ "number",
57
+ "paragraph_title",
58
+ "reference",
59
+ "reference_content",
60
+ "seal",
61
+ "table",
62
+ "text",
63
+ "vertical_text",
64
+ "vision_footnote"
65
+ ],
66
+ "Hpi": {
67
+ "backend_configs": {
68
+ "paddle_infer": {
69
+ "trt_dynamic_shapes": {
70
+ "image": [
71
+ [
72
+ 1,
73
+ 3,
74
+ 800,
75
+ 800
76
+ ],
77
+ [
78
+ 1,
79
+ 3,
80
+ 800,
81
+ 800
82
+ ],
83
+ [
84
+ 8,
85
+ 3,
86
+ 800,
87
+ 800
88
+ ]
89
+ ],
90
+ "scale_factor": [
91
+ [
92
+ 1,
93
+ 2
94
+ ],
95
+ [
96
+ 1,
97
+ 2
98
+ ],
99
+ [
100
+ 8,
101
+ 2
102
+ ]
103
+ ]
104
+ },
105
+ "trt_dynamic_shape_input_data": {
106
+ "scale_factor": [
107
+ [
108
+ 2,
109
+ 2
110
+ ],
111
+ [
112
+ 1,
113
+ 1
114
+ ],
115
+ [
116
+ 0.67,
117
+ 0.67,
118
+ 0.67,
119
+ 0.67,
120
+ 0.67,
121
+ 0.67,
122
+ 0.67,
123
+ 0.67,
124
+ 0.67,
125
+ 0.67,
126
+ 0.67,
127
+ 0.67,
128
+ 0.67,
129
+ 0.67,
130
+ 0.67,
131
+ 0.67
132
+ ]
133
+ ]
134
+ }
135
+ },
136
+ "tensorrt": {
137
+ "dynamic_shapes": {
138
+ "image": [
139
+ [
140
+ 1,
141
+ 3,
142
+ 800,
143
+ 800
144
+ ],
145
+ [
146
+ 1,
147
+ 3,
148
+ 800,
149
+ 800
150
+ ],
151
+ [
152
+ 8,
153
+ 3,
154
+ 800,
155
+ 800
156
+ ]
157
+ ],
158
+ "scale_factor": [
159
+ [
160
+ 1,
161
+ 2
162
+ ],
163
+ [
164
+ 1,
165
+ 2
166
+ ],
167
+ [
168
+ 8,
169
+ 2
170
+ ]
171
+ ]
172
+ }
173
+ }
174
+ }
175
+ }
176
+ }
PP-DocLayoutV2/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45404a84c9fdf91d7bbc94bd47ac4c03649bda84167de04c62bff4726657869a
3
+ size 212170944
PP-DocLayoutV2/inference.pdmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fddd4b4359b95e6f1dd86c86f05e9516517bead9089287681838cdcbf003563b
3
+ size 1515181
PP-DocLayoutV2/inference.yml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mode: paddle
2
+ draw_threshold: 0.5
3
+ metric: COCO
4
+ use_dynamic_shape: false
5
+ Global:
6
+ model_name: PP-DocLayoutV2
7
+ arch: DETR
8
+ min_subgraph_size: 3
9
+ Preprocess:
10
+ - interp: 2
11
+ keep_ratio: false
12
+ target_size:
13
+ - 800
14
+ - 800
15
+ type: Resize
16
+ - mean:
17
+ - 0.0
18
+ - 0.0
19
+ - 0.0
20
+ norm_type: none
21
+ std:
22
+ - 1.0
23
+ - 1.0
24
+ - 1.0
25
+ type: NormalizeImage
26
+ - type: Permute
27
+ label_list:
28
+ - abstract
29
+ - algorithm
30
+ - aside_text
31
+ - chart
32
+ - content
33
+ - display_formula
34
+ - doc_title
35
+ - figure_title
36
+ - footer
37
+ - footer_image
38
+ - footnote
39
+ - formula_number
40
+ - header
41
+ - header_image
42
+ - image
43
+ - inline_formula
44
+ - number
45
+ - paragraph_title
46
+ - reference
47
+ - reference_content
48
+ - seal
49
+ - table
50
+ - text
51
+ - vertical_text
52
+ - vision_footnote
53
+ Hpi:
54
+ backend_configs:
55
+ paddle_infer:
56
+ trt_dynamic_shapes: &id001
57
+ image:
58
+ - - 1
59
+ - 3
60
+ - 800
61
+ - 800
62
+ - - 1
63
+ - 3
64
+ - 800
65
+ - 800
66
+ - - 8
67
+ - 3
68
+ - 800
69
+ - 800
70
+ scale_factor:
71
+ - - 1
72
+ - 2
73
+ - - 1
74
+ - 2
75
+ - - 8
76
+ - 2
77
+ trt_dynamic_shape_input_data:
78
+ scale_factor:
79
+ - - 2
80
+ - 2
81
+ - - 1
82
+ - 1
83
+ - - 0.67
84
+ - 0.67
85
+ - 0.67
86
+ - 0.67
87
+ - 0.67
88
+ - 0.67
89
+ - 0.67
90
+ - 0.67
91
+ - 0.67
92
+ - 0.67
93
+ - 0.67
94
+ - 0.67
95
+ - 0.67
96
+ - 0.67
97
+ - 0.67
98
+ - 0.67
99
+ tensorrt:
100
+ dynamic_shapes: *id001
README.md ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: image-text-to-text
4
+ tags:
5
+ - ERNIE4.5
6
+ - PaddleOCR
7
+ - PaddlePaddle
8
+ - image-to-text
9
+ - ocr
10
+ - document-parse
11
+ - layout
12
+ - table
13
+ - formula
14
+ - chart
15
+ base_model: baidu/ERNIE-4.5-0.3B-Paddle
16
+ language:
17
+ - en
18
+ - zh
19
+ - multilingual
20
+ library_name: PaddleOCR
21
+ ---
22
+
23
+ <div align="center">
24
+
25
+
26
+ <h1 align="center">
27
+
28
+ PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model
29
+
30
+ </h1>
31
+
32
+ [![repo](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
33
+ [![HuggingFace](https://img.shields.io/badge/HuggingFace-black.svg?logo=&labelColor=white)](https://huggingface.co/PaddlePaddle/PaddleOCR-VL)
34
+ [![ModelScope](https://img.shields.io/badge/ModelScope-black?logo=&labelColor=white)](https://modelscope.cn/models/PaddlePaddle/PaddleOCR-VL)
35
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-black.svg?logo=&labelColor=white)](https://huggingface.co/spaces/PaddlePaddle/PaddleOCR-VL_Online_Demo)
36
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-black?logo=&labelColor=white)](https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary)
37
+ [![Discord](https://img.shields.io/badge/Discord-ERNIE-5865F2?logo=discord&logoColor=white)](https://discord.gg/JPmZXDsEEK)
38
+ [![X](https://img.shields.io/badge/X-PaddlePaddle-6080F0)](https://x.com/PaddlePaddle)
39
+ [![License](https://img.shields.io/badge/license-Apache_2.0-green)](./LICENSE)
40
+
41
+ **🔥 Official Demo**: [Baidu AI Studio](https://aistudio.baidu.com/application/detail/98365) |
42
+ **📝 arXiv**: [Technical Report](https://arxiv.org/pdf/2510.14528)
43
+
44
+ </div>
45
+
46
+ <div align="center">
47
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/allmetric.png" width="800"/>
48
+ </div>
49
+
50
+ ## Introduction
51
+
52
+ **PaddleOCR-VL** is a SOTA and resource-efficient model tailored for document parsing. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful vision-language model (VLM) that integrates a NaViT-style dynamic resolution visual encoder with the ERNIE-4.5-0.3B language model to enable accurate element recognition. This innovative model efficiently supports 109 languages and excels in recognizing complex elements (e.g., text, tables, formulas, and charts), while maintaining minimal resource consumption. Through comprehensive evaluations on widely used public benchmarks and in-house benchmarks, PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing solutions, exhibits strong competitiveness against top-tier VLMs, and delivers fast inference speeds. These strengths make it highly suitable for practical deployment in real-world scenarios.
53
+
54
+ ### **Core Features**
55
+
56
+ 1. **Compact yet Powerful VLM Architecture:** We present a novel vision-language model that is specifically designed for resource-efficient inference, achieving outstanding performance in element recognition. By integrating a NaViT-style dynamic high-resolution visual encoder with the lightweight ERNIE-4.5-0.3B language model, we significantly enhance the model’s recognition capabilities and decoding efficiency. This integration maintains high accuracy while reducing computational demands, making it well-suited for efficient and practical document processing applications.
57
+
58
+
59
+ 2. **SOTA Performance on Document Parsing:** PaddleOCR-VL achieves state-of-the-art performance in both page-level document parsing and element-level recognition. It significantly outperforms existing pipeline-based solutions and exhibits strong competitiveness against leading vision-language models (VLMs) in document parsing. Moreover, it excels in recognizing complex document elements, such as text, tables, formulas, and charts, making it suitable for a wide range of challenging content types, including handwritten text and historical documents. This makes it highly versatile and suitable for a wide range of document types and scenarios.
60
+
61
+
62
+ 3. **Multilingual Support:** PaddleOCR-VL supports 109 languages, covering major global languages, including but not limited to Chinese, English, Japanese, Latin, and Korean, as well as languages with different scripts and structures, such as Russian (Cyrillic script), Arabic, Hindi (Devanagari script), and Thai. This broad language coverage substantially enhances the applicability of our system to multilingual and globalized document processing scenarios.
63
+
64
+
65
+ ### **Model Architecture**
66
+
67
+ <!-- PaddleOCR-VL decomposes the complex task of document parsing into two stages. The first stage, PP-DocLayoutV2, is responsible for layout analysis, where it localizes semantic regions and predicts their reading order. Subsequently, the second stage, PaddleOCR-VL-0.9B, leverages these layout predictions to perform fine-grained recognition of diverse content, including text, tables, formulas, and charts. Finally, a lightweight post-processing module aggregates the outputs from both stages and formats the final document into structured Markdown and JSON. -->
68
+
69
+ <div align="center">
70
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/paddleocrvl.png" width="800"/>
71
+ </div>
72
+
73
+
74
+ ## News
75
+ * ```2025.10.16``` 🚀 We release [PaddleOCR-VL](https://github.com/PaddlePaddle/PaddleOCR) — a multilingual document parsing solution powered by a 0.9B Ultra-Compact Vision-Language Model with SOTA performance.
76
+ * ```2025.10.29``` Supports calling the core module PaddleOCR-VL-0.9B of PaddleOCR-VL via the `transformers` library.
77
+
78
+
79
+ ## Usage
80
+
81
+ ### Install Dependencies
82
+
83
+ Install [PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick) and [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR):
84
+
85
+ ```bash
86
+ python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
87
+ python -m pip install -U "paddleocr[doc-parser]"
88
+ python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
89
+ ```
90
+
91
+ > For Windows users, please use WSL or a Docker container.
92
+
93
+
94
+ ### Basic Usage
95
+
96
+ CLI usage:
97
+
98
+ ```bash
99
+ paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png
100
+ ```
101
+
102
+ Python API usage:
103
+
104
+ ```python
105
+ from paddleocr import PaddleOCRVL
106
+ pipeline = PaddleOCRVL()
107
+ output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png")
108
+ for res in output:
109
+ res.print()
110
+ res.save_to_json(save_path="output")
111
+ res.save_to_markdown(save_path="output")
112
+ ```
113
+
114
+ ### Accelerate VLM Inference via Optimized Inference Servers
115
+
116
+ 1. Start the VLM inference server (the default port is `8080`):
117
+
118
+ ```bash
119
+ docker run \
120
+ --rm \
121
+ --gpus all \
122
+ --network host \
123
+ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlex-genai-vllm-server
124
+ ```
125
+ 2. Call the PaddleOCR CLI or Python API:
126
+
127
+ ```bash
128
+ paddleocr doc_parser \
129
+ -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png \
130
+ --vl_rec_backend vllm-server \
131
+ --vl_rec_server_url http://127.0.0.1:8080/v1
132
+ ```
133
+ ```python
134
+ from paddleocr import PaddleOCRVL
135
+ pipeline = PaddleOCRVL(vl_rec_backend="vllm-server", vl_rec_server_url="http://127.0.0.1:8080/v1")
136
+ output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png")
137
+ for res in output:
138
+ res.print()
139
+ res.save_to_json(save_path="output")
140
+ res.save_to_markdown(save_path="output")
141
+ ```
142
+
143
+ **For more usage details and parameter explanations, see the [documentation](https://www.paddleocr.ai/latest/en/version3.x/pipeline_usage/PaddleOCR-VL.html).**
144
+
145
+ ## PaddleOCR-VL-0.9B Usage with transformers
146
+
147
+ Currently, we support inference using the PaddleOCR-VL-0.9B model with the `transformers` library, which can recognize texts, formulas, tables, and chart elements. In the future, we plan to support full document parsing inference with `transformers`. Below is a simple script we provide to support inference using the PaddleOCR-VL-0.9B model with `transformers`.
148
+
149
+ > [!NOTE]
150
+ > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
151
+
152
+ ```python
153
+ from PIL import Image
154
+ import torch
155
+ from transformers import AutoModelForCausalLM, AutoProcessor
156
+
157
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
158
+
159
+ CHOSEN_TASK = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula'
160
+ PROMPTS = {
161
+ "ocr": "OCR:",
162
+ "table": "Table Recognition:",
163
+ "formula": "Formula Recognition:",
164
+ "chart": "Chart Recognition:",
165
+ }
166
+
167
+ model_path = "PaddlePaddle/PaddleOCR-VL"
168
+ image_path = "test.png"
169
+ image = Image.open(image_path).convert("RGB")
170
+
171
+ model = AutoModelForCausalLM.from_pretrained(
172
+ model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
173
+ ).to(DEVICE).eval()
174
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
175
+
176
+ messages = [
177
+ {"role": "user",
178
+ "content": [
179
+ {"type": "image", "image": image},
180
+ {"type": "text", "text": PROMPTS[CHOSEN_TASK]},
181
+ ]
182
+ }
183
+ ]
184
+ inputs = processor.apply_chat_template(
185
+ messages,
186
+ tokenize=True,
187
+ add_generation_prompt=True,
188
+ return_dict=True,
189
+ return_tensors="pt"
190
+ ).to(DEVICE)
191
+
192
+ outputs = model.generate(**inputs, max_new_tokens=1024)
193
+ outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
194
+ print(outputs)
195
+ ```
196
+
197
+ ## Performance
198
+
199
+ ### Page-Level Document Parsing
200
+
201
+
202
+ #### 1. OmniDocBench v1.5
203
+
204
+ ##### PaddleOCR-VL achieves SOTA performance for overall, text, formula, tables and reading order on OmniDocBench v1.5
205
+
206
+ <div align="center">
207
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni15.png" width="800"/>
208
+ </div>
209
+
210
+
211
+
212
+ #### 2. OmniDocBench v1.0
213
+
214
+ ##### PaddleOCR-VL achieves SOTA performance for almost all metrics of overall, text, formula, tables and reading order on OmniDocBench v1.0
215
+
216
+
217
+ <div align="center">
218
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni10.png" width="800"/>
219
+ </div>
220
+
221
+
222
+ > **Notes:**
223
+ > - The metrics are from [MinerU](https://github.com/opendatalab/MinerU), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
224
+
225
+
226
+ ### Element-level Recognition
227
+
228
+ #### 1. Text
229
+
230
+ **Comparison of OmniDocBench-OCR-block Performance**
231
+
232
+ PaddleOCR-VL demonstrates robust and versatile capability in handling diverse document types, establishing it as the leading method in the OmniDocBench-OCR-block performance evaluation.
233
+
234
+ <div align="center">
235
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omnibenchocr.png" width="800"/>
236
+ </div>
237
+
238
+
239
+ **Comparison of In-house-OCR Performance**
240
+
241
+ In-house-OCR provides an evaluation of performance across multiple languages and text types. Our model demonstrates outstanding accuracy with the lowest edit distances in all evaluated scripts.
242
+
243
+ <div align="center">
244
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouseocr.png" width="800"/>
245
+ </div>
246
+
247
+
248
+
249
+ #### 2. Table
250
+
251
+ **Comparison of In-house-Table Performance**
252
+
253
+ Our self-built evaluation set contains diverse types of table images, including Chinese, English, and mixed Chinese-English tables, as well as tables with various characteristics such as full, partial, or no borders, book/manual formats, lists, academic papers, merged cells, and low-quality or watermarked images. PaddleOCR-VL achieves remarkable performance across all categories.
254
+
255
+ <div align="center">
256
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousetable.png" width="600"/>
257
+ </div>
258
+
259
+ #### 3. Formula
260
+
261
+ **Comparison of In-house-Formula Performance**
262
+
263
+ The In-house-Formula evaluation set contains simple prints, complex prints, camera scans, and handwritten formulas. PaddleOCR-VL demonstrates the best performance in every category.
264
+
265
+ <div align="center">
266
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouse-formula.png" width="500"/>
267
+ </div>
268
+
269
+
270
+ #### 4. Chart
271
+
272
+ **Comparison of In-house-Chart Performance**
273
+
274
+ The evaluation set is broadly categorized into 11 chart categories, including bar-line hybrid, pie, 100% stacked bar, area, bar, bubble, histogram, line, scatterplot, stacked area, and stacked bar. PaddleOCR-VL not only outperforms expert OCR VLMs but also surpasses some 72B-level multimodal language models.
275
+
276
+ <div align="center">
277
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousechart.png" width="400"/>
278
+ </div>
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+ ## Visualization
287
+
288
+
289
+ ### Comprehensive Document Parsing
290
+
291
+ <div align="center">
292
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview1.jpg" width="600"/>
293
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview2.jpg" width="600"/>
294
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview3.jpg" width="600"/>
295
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview4.jpg" width="600"/>
296
+ </div>
297
+
298
+
299
+ ### Text
300
+
301
+ <div align="center">
302
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_english_arabic.jpg" width="300" style="display: inline-block;"/>
303
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_handwriting_02.jpg" width="300" style="display: inline-block;"/>
304
+ </div>
305
+
306
+
307
+ ### Table
308
+
309
+ <div align="center">
310
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_01.jpg" width="300" style="display: inline-block;"/>
311
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_02.jpg" width="300" style="display: inline-block;"/>
312
+ </div>
313
+
314
+
315
+ ### Formula
316
+
317
+ <div align="center">
318
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_EN.jpg" width="300" style="display: inline-block;"/>
319
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_ZH.jpg" width="300" style="display: inline-block;"/>
320
+ </div>
321
+
322
+
323
+ ### Chart
324
+
325
+ <div align="center">
326
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_01.jpg" width="300" style="display: inline-block;"/>
327
+ <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_02.jpg" width="300" style="display: inline-block;"/>
328
+ </div>
329
+
330
+
331
+ ## Acknowledgments
332
+
333
+ We would like to thank [ERNIE](https://github.com/PaddlePaddle/ERNIE), [Keye](https://github.com/Kwai-Keye/Keye), [MinerU](https://github.com/opendatalab/MinerU), and [OmniDocBench](https://github.com/opendatalab/OmniDocBench) for providing valuable code, model weights and benchmarks. We also appreciate everyone's contribution to this open-source project!
334
+
335
+ ## Citation
336
+
337
+ If you find PaddleOCR-VL helpful, feel free to give us a star and cite our work.
338
+
339
+ ```bibtex
340
+ @misc{cui2025paddleocrvlboostingmultilingualdocument,
341
+ title={PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model},
342
+ author={Cheng Cui and Ting Sun and Suyin Liang and Tingquan Gao and Zelun Zhang and Jiaxuan Liu and Xueqing Wang and Changda Zhou and Hongen Liu and Manhui Lin and Yue Zhang and Yubo Zhang and Handong Zheng and Jing Zhang and Jun Zhang and Yi Liu and Dianhai Yu and Yanjun Ma},
343
+ year={2025},
344
+ eprint={2510.14528},
345
+ archivePrefix={arXiv},
346
+ primaryClass={cs.CV},
347
+ url={https://arxiv.org/abs/2510.14528},
348
+ }
349
+ ```
added_tokens.json ADDED
@@ -0,0 +1,1021 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<ecel>": 101308,
3
+ "<fcel>": 101309,
4
+ "<lcel>": 101311,
5
+ "<nl>": 101313,
6
+ "<ucel>": 101312,
7
+ "<xcel>": 101310,
8
+ "<|AUDIO_PLACEHOLDER|>": 100296,
9
+ "<|CROP_COL_SEP|>": 101301,
10
+ "<|CROP_ROW_SEP|>": 101302,
11
+ "<|IMAGE_END|>": 101306,
12
+ "<|IMAGE_PLACEHOLDER|>": 100295,
13
+ "<|IMAGE_SEP|>": 101303,
14
+ "<|IMAGE_START|>": 101305,
15
+ "<|LOC_0|>": 100297,
16
+ "<|LOC_1000|>": 101297,
17
+ "<|LOC_100|>": 100397,
18
+ "<|LOC_101|>": 100398,
19
+ "<|LOC_102|>": 100399,
20
+ "<|LOC_103|>": 100400,
21
+ "<|LOC_104|>": 100401,
22
+ "<|LOC_105|>": 100402,
23
+ "<|LOC_106|>": 100403,
24
+ "<|LOC_107|>": 100404,
25
+ "<|LOC_108|>": 100405,
26
+ "<|LOC_109|>": 100406,
27
+ "<|LOC_10|>": 100307,
28
+ "<|LOC_110|>": 100407,
29
+ "<|LOC_111|>": 100408,
30
+ "<|LOC_112|>": 100409,
31
+ "<|LOC_113|>": 100410,
32
+ "<|LOC_114|>": 100411,
33
+ "<|LOC_115|>": 100412,
34
+ "<|LOC_116|>": 100413,
35
+ "<|LOC_117|>": 100414,
36
+ "<|LOC_118|>": 100415,
37
+ "<|LOC_119|>": 100416,
38
+ "<|LOC_11|>": 100308,
39
+ "<|LOC_120|>": 100417,
40
+ "<|LOC_121|>": 100418,
41
+ "<|LOC_122|>": 100419,
42
+ "<|LOC_123|>": 100420,
43
+ "<|LOC_124|>": 100421,
44
+ "<|LOC_125|>": 100422,
45
+ "<|LOC_126|>": 100423,
46
+ "<|LOC_127|>": 100424,
47
+ "<|LOC_128|>": 100425,
48
+ "<|LOC_129|>": 100426,
49
+ "<|LOC_12|>": 100309,
50
+ "<|LOC_130|>": 100427,
51
+ "<|LOC_131|>": 100428,
52
+ "<|LOC_132|>": 100429,
53
+ "<|LOC_133|>": 100430,
54
+ "<|LOC_134|>": 100431,
55
+ "<|LOC_135|>": 100432,
56
+ "<|LOC_136|>": 100433,
57
+ "<|LOC_137|>": 100434,
58
+ "<|LOC_138|>": 100435,
59
+ "<|LOC_139|>": 100436,
60
+ "<|LOC_13|>": 100310,
61
+ "<|LOC_140|>": 100437,
62
+ "<|LOC_141|>": 100438,
63
+ "<|LOC_142|>": 100439,
64
+ "<|LOC_143|>": 100440,
65
+ "<|LOC_144|>": 100441,
66
+ "<|LOC_145|>": 100442,
67
+ "<|LOC_146|>": 100443,
68
+ "<|LOC_147|>": 100444,
69
+ "<|LOC_148|>": 100445,
70
+ "<|LOC_149|>": 100446,
71
+ "<|LOC_14|>": 100311,
72
+ "<|LOC_150|>": 100447,
73
+ "<|LOC_151|>": 100448,
74
+ "<|LOC_152|>": 100449,
75
+ "<|LOC_153|>": 100450,
76
+ "<|LOC_154|>": 100451,
77
+ "<|LOC_155|>": 100452,
78
+ "<|LOC_156|>": 100453,
79
+ "<|LOC_157|>": 100454,
80
+ "<|LOC_158|>": 100455,
81
+ "<|LOC_159|>": 100456,
82
+ "<|LOC_15|>": 100312,
83
+ "<|LOC_160|>": 100457,
84
+ "<|LOC_161|>": 100458,
85
+ "<|LOC_162|>": 100459,
86
+ "<|LOC_163|>": 100460,
87
+ "<|LOC_164|>": 100461,
88
+ "<|LOC_165|>": 100462,
89
+ "<|LOC_166|>": 100463,
90
+ "<|LOC_167|>": 100464,
91
+ "<|LOC_168|>": 100465,
92
+ "<|LOC_169|>": 100466,
93
+ "<|LOC_16|>": 100313,
94
+ "<|LOC_170|>": 100467,
95
+ "<|LOC_171|>": 100468,
96
+ "<|LOC_172|>": 100469,
97
+ "<|LOC_173|>": 100470,
98
+ "<|LOC_174|>": 100471,
99
+ "<|LOC_175|>": 100472,
100
+ "<|LOC_176|>": 100473,
101
+ "<|LOC_177|>": 100474,
102
+ "<|LOC_178|>": 100475,
103
+ "<|LOC_179|>": 100476,
104
+ "<|LOC_17|>": 100314,
105
+ "<|LOC_180|>": 100477,
106
+ "<|LOC_181|>": 100478,
107
+ "<|LOC_182|>": 100479,
108
+ "<|LOC_183|>": 100480,
109
+ "<|LOC_184|>": 100481,
110
+ "<|LOC_185|>": 100482,
111
+ "<|LOC_186|>": 100483,
112
+ "<|LOC_187|>": 100484,
113
+ "<|LOC_188|>": 100485,
114
+ "<|LOC_189|>": 100486,
115
+ "<|LOC_18|>": 100315,
116
+ "<|LOC_190|>": 100487,
117
+ "<|LOC_191|>": 100488,
118
+ "<|LOC_192|>": 100489,
119
+ "<|LOC_193|>": 100490,
120
+ "<|LOC_194|>": 100491,
121
+ "<|LOC_195|>": 100492,
122
+ "<|LOC_196|>": 100493,
123
+ "<|LOC_197|>": 100494,
124
+ "<|LOC_198|>": 100495,
125
+ "<|LOC_199|>": 100496,
126
+ "<|LOC_19|>": 100316,
127
+ "<|LOC_1|>": 100298,
128
+ "<|LOC_200|>": 100497,
129
+ "<|LOC_201|>": 100498,
130
+ "<|LOC_202|>": 100499,
131
+ "<|LOC_203|>": 100500,
132
+ "<|LOC_204|>": 100501,
133
+ "<|LOC_205|>": 100502,
134
+ "<|LOC_206|>": 100503,
135
+ "<|LOC_207|>": 100504,
136
+ "<|LOC_208|>": 100505,
137
+ "<|LOC_209|>": 100506,
138
+ "<|LOC_20|>": 100317,
139
+ "<|LOC_210|>": 100507,
140
+ "<|LOC_211|>": 100508,
141
+ "<|LOC_212|>": 100509,
142
+ "<|LOC_213|>": 100510,
143
+ "<|LOC_214|>": 100511,
144
+ "<|LOC_215|>": 100512,
145
+ "<|LOC_216|>": 100513,
146
+ "<|LOC_217|>": 100514,
147
+ "<|LOC_218|>": 100515,
148
+ "<|LOC_219|>": 100516,
149
+ "<|LOC_21|>": 100318,
150
+ "<|LOC_220|>": 100517,
151
+ "<|LOC_221|>": 100518,
152
+ "<|LOC_222|>": 100519,
153
+ "<|LOC_223|>": 100520,
154
+ "<|LOC_224|>": 100521,
155
+ "<|LOC_225|>": 100522,
156
+ "<|LOC_226|>": 100523,
157
+ "<|LOC_227|>": 100524,
158
+ "<|LOC_228|>": 100525,
159
+ "<|LOC_229|>": 100526,
160
+ "<|LOC_22|>": 100319,
161
+ "<|LOC_230|>": 100527,
162
+ "<|LOC_231|>": 100528,
163
+ "<|LOC_232|>": 100529,
164
+ "<|LOC_233|>": 100530,
165
+ "<|LOC_234|>": 100531,
166
+ "<|LOC_235|>": 100532,
167
+ "<|LOC_236|>": 100533,
168
+ "<|LOC_237|>": 100534,
169
+ "<|LOC_238|>": 100535,
170
+ "<|LOC_239|>": 100536,
171
+ "<|LOC_23|>": 100320,
172
+ "<|LOC_240|>": 100537,
173
+ "<|LOC_241|>": 100538,
174
+ "<|LOC_242|>": 100539,
175
+ "<|LOC_243|>": 100540,
176
+ "<|LOC_244|>": 100541,
177
+ "<|LOC_245|>": 100542,
178
+ "<|LOC_246|>": 100543,
179
+ "<|LOC_247|>": 100544,
180
+ "<|LOC_248|>": 100545,
181
+ "<|LOC_249|>": 100546,
182
+ "<|LOC_24|>": 100321,
183
+ "<|LOC_250|>": 100547,
184
+ "<|LOC_251|>": 100548,
185
+ "<|LOC_252|>": 100549,
186
+ "<|LOC_253|>": 100550,
187
+ "<|LOC_254|>": 100551,
188
+ "<|LOC_255|>": 100552,
189
+ "<|LOC_256|>": 100553,
190
+ "<|LOC_257|>": 100554,
191
+ "<|LOC_258|>": 100555,
192
+ "<|LOC_259|>": 100556,
193
+ "<|LOC_25|>": 100322,
194
+ "<|LOC_260|>": 100557,
195
+ "<|LOC_261|>": 100558,
196
+ "<|LOC_262|>": 100559,
197
+ "<|LOC_263|>": 100560,
198
+ "<|LOC_264|>": 100561,
199
+ "<|LOC_265|>": 100562,
200
+ "<|LOC_266|>": 100563,
201
+ "<|LOC_267|>": 100564,
202
+ "<|LOC_268|>": 100565,
203
+ "<|LOC_269|>": 100566,
204
+ "<|LOC_26|>": 100323,
205
+ "<|LOC_270|>": 100567,
206
+ "<|LOC_271|>": 100568,
207
+ "<|LOC_272|>": 100569,
208
+ "<|LOC_273|>": 100570,
209
+ "<|LOC_274|>": 100571,
210
+ "<|LOC_275|>": 100572,
211
+ "<|LOC_276|>": 100573,
212
+ "<|LOC_277|>": 100574,
213
+ "<|LOC_278|>": 100575,
214
+ "<|LOC_279|>": 100576,
215
+ "<|LOC_27|>": 100324,
216
+ "<|LOC_280|>": 100577,
217
+ "<|LOC_281|>": 100578,
218
+ "<|LOC_282|>": 100579,
219
+ "<|LOC_283|>": 100580,
220
+ "<|LOC_284|>": 100581,
221
+ "<|LOC_285|>": 100582,
222
+ "<|LOC_286|>": 100583,
223
+ "<|LOC_287|>": 100584,
224
+ "<|LOC_288|>": 100585,
225
+ "<|LOC_289|>": 100586,
226
+ "<|LOC_28|>": 100325,
227
+ "<|LOC_290|>": 100587,
228
+ "<|LOC_291|>": 100588,
229
+ "<|LOC_292|>": 100589,
230
+ "<|LOC_293|>": 100590,
231
+ "<|LOC_294|>": 100591,
232
+ "<|LOC_295|>": 100592,
233
+ "<|LOC_296|>": 100593,
234
+ "<|LOC_297|>": 100594,
235
+ "<|LOC_298|>": 100595,
236
+ "<|LOC_299|>": 100596,
237
+ "<|LOC_29|>": 100326,
238
+ "<|LOC_2|>": 100299,
239
+ "<|LOC_300|>": 100597,
240
+ "<|LOC_301|>": 100598,
241
+ "<|LOC_302|>": 100599,
242
+ "<|LOC_303|>": 100600,
243
+ "<|LOC_304|>": 100601,
244
+ "<|LOC_305|>": 100602,
245
+ "<|LOC_306|>": 100603,
246
+ "<|LOC_307|>": 100604,
247
+ "<|LOC_308|>": 100605,
248
+ "<|LOC_309|>": 100606,
249
+ "<|LOC_30|>": 100327,
250
+ "<|LOC_310|>": 100607,
251
+ "<|LOC_311|>": 100608,
252
+ "<|LOC_312|>": 100609,
253
+ "<|LOC_313|>": 100610,
254
+ "<|LOC_314|>": 100611,
255
+ "<|LOC_315|>": 100612,
256
+ "<|LOC_316|>": 100613,
257
+ "<|LOC_317|>": 100614,
258
+ "<|LOC_318|>": 100615,
259
+ "<|LOC_319|>": 100616,
260
+ "<|LOC_31|>": 100328,
261
+ "<|LOC_320|>": 100617,
262
+ "<|LOC_321|>": 100618,
263
+ "<|LOC_322|>": 100619,
264
+ "<|LOC_323|>": 100620,
265
+ "<|LOC_324|>": 100621,
266
+ "<|LOC_325|>": 100622,
267
+ "<|LOC_326|>": 100623,
268
+ "<|LOC_327|>": 100624,
269
+ "<|LOC_328|>": 100625,
270
+ "<|LOC_329|>": 100626,
271
+ "<|LOC_32|>": 100329,
272
+ "<|LOC_330|>": 100627,
273
+ "<|LOC_331|>": 100628,
274
+ "<|LOC_332|>": 100629,
275
+ "<|LOC_333|>": 100630,
276
+ "<|LOC_334|>": 100631,
277
+ "<|LOC_335|>": 100632,
278
+ "<|LOC_336|>": 100633,
279
+ "<|LOC_337|>": 100634,
280
+ "<|LOC_338|>": 100635,
281
+ "<|LOC_339|>": 100636,
282
+ "<|LOC_33|>": 100330,
283
+ "<|LOC_340|>": 100637,
284
+ "<|LOC_341|>": 100638,
285
+ "<|LOC_342|>": 100639,
286
+ "<|LOC_343|>": 100640,
287
+ "<|LOC_344|>": 100641,
288
+ "<|LOC_345|>": 100642,
289
+ "<|LOC_346|>": 100643,
290
+ "<|LOC_347|>": 100644,
291
+ "<|LOC_348|>": 100645,
292
+ "<|LOC_349|>": 100646,
293
+ "<|LOC_34|>": 100331,
294
+ "<|LOC_350|>": 100647,
295
+ "<|LOC_351|>": 100648,
296
+ "<|LOC_352|>": 100649,
297
+ "<|LOC_353|>": 100650,
298
+ "<|LOC_354|>": 100651,
299
+ "<|LOC_355|>": 100652,
300
+ "<|LOC_356|>": 100653,
301
+ "<|LOC_357|>": 100654,
302
+ "<|LOC_358|>": 100655,
303
+ "<|LOC_359|>": 100656,
304
+ "<|LOC_35|>": 100332,
305
+ "<|LOC_360|>": 100657,
306
+ "<|LOC_361|>": 100658,
307
+ "<|LOC_362|>": 100659,
308
+ "<|LOC_363|>": 100660,
309
+ "<|LOC_364|>": 100661,
310
+ "<|LOC_365|>": 100662,
311
+ "<|LOC_366|>": 100663,
312
+ "<|LOC_367|>": 100664,
313
+ "<|LOC_368|>": 100665,
314
+ "<|LOC_369|>": 100666,
315
+ "<|LOC_36|>": 100333,
316
+ "<|LOC_370|>": 100667,
317
+ "<|LOC_371|>": 100668,
318
+ "<|LOC_372|>": 100669,
319
+ "<|LOC_373|>": 100670,
320
+ "<|LOC_374|>": 100671,
321
+ "<|LOC_375|>": 100672,
322
+ "<|LOC_376|>": 100673,
323
+ "<|LOC_377|>": 100674,
324
+ "<|LOC_378|>": 100675,
325
+ "<|LOC_379|>": 100676,
326
+ "<|LOC_37|>": 100334,
327
+ "<|LOC_380|>": 100677,
328
+ "<|LOC_381|>": 100678,
329
+ "<|LOC_382|>": 100679,
330
+ "<|LOC_383|>": 100680,
331
+ "<|LOC_384|>": 100681,
332
+ "<|LOC_385|>": 100682,
333
+ "<|LOC_386|>": 100683,
334
+ "<|LOC_387|>": 100684,
335
+ "<|LOC_388|>": 100685,
336
+ "<|LOC_389|>": 100686,
337
+ "<|LOC_38|>": 100335,
338
+ "<|LOC_390|>": 100687,
339
+ "<|LOC_391|>": 100688,
340
+ "<|LOC_392|>": 100689,
341
+ "<|LOC_393|>": 100690,
342
+ "<|LOC_394|>": 100691,
343
+ "<|LOC_395|>": 100692,
344
+ "<|LOC_396|>": 100693,
345
+ "<|LOC_397|>": 100694,
346
+ "<|LOC_398|>": 100695,
347
+ "<|LOC_399|>": 100696,
348
+ "<|LOC_39|>": 100336,
349
+ "<|LOC_3|>": 100300,
350
+ "<|LOC_400|>": 100697,
351
+ "<|LOC_401|>": 100698,
352
+ "<|LOC_402|>": 100699,
353
+ "<|LOC_403|>": 100700,
354
+ "<|LOC_404|>": 100701,
355
+ "<|LOC_405|>": 100702,
356
+ "<|LOC_406|>": 100703,
357
+ "<|LOC_407|>": 100704,
358
+ "<|LOC_408|>": 100705,
359
+ "<|LOC_409|>": 100706,
360
+ "<|LOC_40|>": 100337,
361
+ "<|LOC_410|>": 100707,
362
+ "<|LOC_411|>": 100708,
363
+ "<|LOC_412|>": 100709,
364
+ "<|LOC_413|>": 100710,
365
+ "<|LOC_414|>": 100711,
366
+ "<|LOC_415|>": 100712,
367
+ "<|LOC_416|>": 100713,
368
+ "<|LOC_417|>": 100714,
369
+ "<|LOC_418|>": 100715,
370
+ "<|LOC_419|>": 100716,
371
+ "<|LOC_41|>": 100338,
372
+ "<|LOC_420|>": 100717,
373
+ "<|LOC_421|>": 100718,
374
+ "<|LOC_422|>": 100719,
375
+ "<|LOC_423|>": 100720,
376
+ "<|LOC_424|>": 100721,
377
+ "<|LOC_425|>": 100722,
378
+ "<|LOC_426|>": 100723,
379
+ "<|LOC_427|>": 100724,
380
+ "<|LOC_428|>": 100725,
381
+ "<|LOC_429|>": 100726,
382
+ "<|LOC_42|>": 100339,
383
+ "<|LOC_430|>": 100727,
384
+ "<|LOC_431|>": 100728,
385
+ "<|LOC_432|>": 100729,
386
+ "<|LOC_433|>": 100730,
387
+ "<|LOC_434|>": 100731,
388
+ "<|LOC_435|>": 100732,
389
+ "<|LOC_436|>": 100733,
390
+ "<|LOC_437|>": 100734,
391
+ "<|LOC_438|>": 100735,
392
+ "<|LOC_439|>": 100736,
393
+ "<|LOC_43|>": 100340,
394
+ "<|LOC_440|>": 100737,
395
+ "<|LOC_441|>": 100738,
396
+ "<|LOC_442|>": 100739,
397
+ "<|LOC_443|>": 100740,
398
+ "<|LOC_444|>": 100741,
399
+ "<|LOC_445|>": 100742,
400
+ "<|LOC_446|>": 100743,
401
+ "<|LOC_447|>": 100744,
402
+ "<|LOC_448|>": 100745,
403
+ "<|LOC_449|>": 100746,
404
+ "<|LOC_44|>": 100341,
405
+ "<|LOC_450|>": 100747,
406
+ "<|LOC_451|>": 100748,
407
+ "<|LOC_452|>": 100749,
408
+ "<|LOC_453|>": 100750,
409
+ "<|LOC_454|>": 100751,
410
+ "<|LOC_455|>": 100752,
411
+ "<|LOC_456|>": 100753,
412
+ "<|LOC_457|>": 100754,
413
+ "<|LOC_458|>": 100755,
414
+ "<|LOC_459|>": 100756,
415
+ "<|LOC_45|>": 100342,
416
+ "<|LOC_460|>": 100757,
417
+ "<|LOC_461|>": 100758,
418
+ "<|LOC_462|>": 100759,
419
+ "<|LOC_463|>": 100760,
420
+ "<|LOC_464|>": 100761,
421
+ "<|LOC_465|>": 100762,
422
+ "<|LOC_466|>": 100763,
423
+ "<|LOC_467|>": 100764,
424
+ "<|LOC_468|>": 100765,
425
+ "<|LOC_469|>": 100766,
426
+ "<|LOC_46|>": 100343,
427
+ "<|LOC_470|>": 100767,
428
+ "<|LOC_471|>": 100768,
429
+ "<|LOC_472|>": 100769,
430
+ "<|LOC_473|>": 100770,
431
+ "<|LOC_474|>": 100771,
432
+ "<|LOC_475|>": 100772,
433
+ "<|LOC_476|>": 100773,
434
+ "<|LOC_477|>": 100774,
435
+ "<|LOC_478|>": 100775,
436
+ "<|LOC_479|>": 100776,
437
+ "<|LOC_47|>": 100344,
438
+ "<|LOC_480|>": 100777,
439
+ "<|LOC_481|>": 100778,
440
+ "<|LOC_482|>": 100779,
441
+ "<|LOC_483|>": 100780,
442
+ "<|LOC_484|>": 100781,
443
+ "<|LOC_485|>": 100782,
444
+ "<|LOC_486|>": 100783,
445
+ "<|LOC_487|>": 100784,
446
+ "<|LOC_488|>": 100785,
447
+ "<|LOC_489|>": 100786,
448
+ "<|LOC_48|>": 100345,
449
+ "<|LOC_490|>": 100787,
450
+ "<|LOC_491|>": 100788,
451
+ "<|LOC_492|>": 100789,
452
+ "<|LOC_493|>": 100790,
453
+ "<|LOC_494|>": 100791,
454
+ "<|LOC_495|>": 100792,
455
+ "<|LOC_496|>": 100793,
456
+ "<|LOC_497|>": 100794,
457
+ "<|LOC_498|>": 100795,
458
+ "<|LOC_499|>": 100796,
459
+ "<|LOC_49|>": 100346,
460
+ "<|LOC_4|>": 100301,
461
+ "<|LOC_500|>": 100797,
462
+ "<|LOC_501|>": 100798,
463
+ "<|LOC_502|>": 100799,
464
+ "<|LOC_503|>": 100800,
465
+ "<|LOC_504|>": 100801,
466
+ "<|LOC_505|>": 100802,
467
+ "<|LOC_506|>": 100803,
468
+ "<|LOC_507|>": 100804,
469
+ "<|LOC_508|>": 100805,
470
+ "<|LOC_509|>": 100806,
471
+ "<|LOC_50|>": 100347,
472
+ "<|LOC_510|>": 100807,
473
+ "<|LOC_511|>": 100808,
474
+ "<|LOC_512|>": 100809,
475
+ "<|LOC_513|>": 100810,
476
+ "<|LOC_514|>": 100811,
477
+ "<|LOC_515|>": 100812,
478
+ "<|LOC_516|>": 100813,
479
+ "<|LOC_517|>": 100814,
480
+ "<|LOC_518|>": 100815,
481
+ "<|LOC_519|>": 100816,
482
+ "<|LOC_51|>": 100348,
483
+ "<|LOC_520|>": 100817,
484
+ "<|LOC_521|>": 100818,
485
+ "<|LOC_522|>": 100819,
486
+ "<|LOC_523|>": 100820,
487
+ "<|LOC_524|>": 100821,
488
+ "<|LOC_525|>": 100822,
489
+ "<|LOC_526|>": 100823,
490
+ "<|LOC_527|>": 100824,
491
+ "<|LOC_528|>": 100825,
492
+ "<|LOC_529|>": 100826,
493
+ "<|LOC_52|>": 100349,
494
+ "<|LOC_530|>": 100827,
495
+ "<|LOC_531|>": 100828,
496
+ "<|LOC_532|>": 100829,
497
+ "<|LOC_533|>": 100830,
498
+ "<|LOC_534|>": 100831,
499
+ "<|LOC_535|>": 100832,
500
+ "<|LOC_536|>": 100833,
501
+ "<|LOC_537|>": 100834,
502
+ "<|LOC_538|>": 100835,
503
+ "<|LOC_539|>": 100836,
504
+ "<|LOC_53|>": 100350,
505
+ "<|LOC_540|>": 100837,
506
+ "<|LOC_541|>": 100838,
507
+ "<|LOC_542|>": 100839,
508
+ "<|LOC_543|>": 100840,
509
+ "<|LOC_544|>": 100841,
510
+ "<|LOC_545|>": 100842,
511
+ "<|LOC_546|>": 100843,
512
+ "<|LOC_547|>": 100844,
513
+ "<|LOC_548|>": 100845,
514
+ "<|LOC_549|>": 100846,
515
+ "<|LOC_54|>": 100351,
516
+ "<|LOC_550|>": 100847,
517
+ "<|LOC_551|>": 100848,
518
+ "<|LOC_552|>": 100849,
519
+ "<|LOC_553|>": 100850,
520
+ "<|LOC_554|>": 100851,
521
+ "<|LOC_555|>": 100852,
522
+ "<|LOC_556|>": 100853,
523
+ "<|LOC_557|>": 100854,
524
+ "<|LOC_558|>": 100855,
525
+ "<|LOC_559|>": 100856,
526
+ "<|LOC_55|>": 100352,
527
+ "<|LOC_560|>": 100857,
528
+ "<|LOC_561|>": 100858,
529
+ "<|LOC_562|>": 100859,
530
+ "<|LOC_563|>": 100860,
531
+ "<|LOC_564|>": 100861,
532
+ "<|LOC_565|>": 100862,
533
+ "<|LOC_566|>": 100863,
534
+ "<|LOC_567|>": 100864,
535
+ "<|LOC_568|>": 100865,
536
+ "<|LOC_569|>": 100866,
537
+ "<|LOC_56|>": 100353,
538
+ "<|LOC_570|>": 100867,
539
+ "<|LOC_571|>": 100868,
540
+ "<|LOC_572|>": 100869,
541
+ "<|LOC_573|>": 100870,
542
+ "<|LOC_574|>": 100871,
543
+ "<|LOC_575|>": 100872,
544
+ "<|LOC_576|>": 100873,
545
+ "<|LOC_577|>": 100874,
546
+ "<|LOC_578|>": 100875,
547
+ "<|LOC_579|>": 100876,
548
+ "<|LOC_57|>": 100354,
549
+ "<|LOC_580|>": 100877,
550
+ "<|LOC_581|>": 100878,
551
+ "<|LOC_582|>": 100879,
552
+ "<|LOC_583|>": 100880,
553
+ "<|LOC_584|>": 100881,
554
+ "<|LOC_585|>": 100882,
555
+ "<|LOC_586|>": 100883,
556
+ "<|LOC_587|>": 100884,
557
+ "<|LOC_588|>": 100885,
558
+ "<|LOC_589|>": 100886,
559
+ "<|LOC_58|>": 100355,
560
+ "<|LOC_590|>": 100887,
561
+ "<|LOC_591|>": 100888,
562
+ "<|LOC_592|>": 100889,
563
+ "<|LOC_593|>": 100890,
564
+ "<|LOC_594|>": 100891,
565
+ "<|LOC_595|>": 100892,
566
+ "<|LOC_596|>": 100893,
567
+ "<|LOC_597|>": 100894,
568
+ "<|LOC_598|>": 100895,
569
+ "<|LOC_599|>": 100896,
570
+ "<|LOC_59|>": 100356,
571
+ "<|LOC_5|>": 100302,
572
+ "<|LOC_600|>": 100897,
573
+ "<|LOC_601|>": 100898,
574
+ "<|LOC_602|>": 100899,
575
+ "<|LOC_603|>": 100900,
576
+ "<|LOC_604|>": 100901,
577
+ "<|LOC_605|>": 100902,
578
+ "<|LOC_606|>": 100903,
579
+ "<|LOC_607|>": 100904,
580
+ "<|LOC_608|>": 100905,
581
+ "<|LOC_609|>": 100906,
582
+ "<|LOC_60|>": 100357,
583
+ "<|LOC_610|>": 100907,
584
+ "<|LOC_611|>": 100908,
585
+ "<|LOC_612|>": 100909,
586
+ "<|LOC_613|>": 100910,
587
+ "<|LOC_614|>": 100911,
588
+ "<|LOC_615|>": 100912,
589
+ "<|LOC_616|>": 100913,
590
+ "<|LOC_617|>": 100914,
591
+ "<|LOC_618|>": 100915,
592
+ "<|LOC_619|>": 100916,
593
+ "<|LOC_61|>": 100358,
594
+ "<|LOC_620|>": 100917,
595
+ "<|LOC_621|>": 100918,
596
+ "<|LOC_622|>": 100919,
597
+ "<|LOC_623|>": 100920,
598
+ "<|LOC_624|>": 100921,
599
+ "<|LOC_625|>": 100922,
600
+ "<|LOC_626|>": 100923,
601
+ "<|LOC_627|>": 100924,
602
+ "<|LOC_628|>": 100925,
603
+ "<|LOC_629|>": 100926,
604
+ "<|LOC_62|>": 100359,
605
+ "<|LOC_630|>": 100927,
606
+ "<|LOC_631|>": 100928,
607
+ "<|LOC_632|>": 100929,
608
+ "<|LOC_633|>": 100930,
609
+ "<|LOC_634|>": 100931,
610
+ "<|LOC_635|>": 100932,
611
+ "<|LOC_636|>": 100933,
612
+ "<|LOC_637|>": 100934,
613
+ "<|LOC_638|>": 100935,
614
+ "<|LOC_639|>": 100936,
615
+ "<|LOC_63|>": 100360,
616
+ "<|LOC_640|>": 100937,
617
+ "<|LOC_641|>": 100938,
618
+ "<|LOC_642|>": 100939,
619
+ "<|LOC_643|>": 100940,
620
+ "<|LOC_644|>": 100941,
621
+ "<|LOC_645|>": 100942,
622
+ "<|LOC_646|>": 100943,
623
+ "<|LOC_647|>": 100944,
624
+ "<|LOC_648|>": 100945,
625
+ "<|LOC_649|>": 100946,
626
+ "<|LOC_64|>": 100361,
627
+ "<|LOC_650|>": 100947,
628
+ "<|LOC_651|>": 100948,
629
+ "<|LOC_652|>": 100949,
630
+ "<|LOC_653|>": 100950,
631
+ "<|LOC_654|>": 100951,
632
+ "<|LOC_655|>": 100952,
633
+ "<|LOC_656|>": 100953,
634
+ "<|LOC_657|>": 100954,
635
+ "<|LOC_658|>": 100955,
636
+ "<|LOC_659|>": 100956,
637
+ "<|LOC_65|>": 100362,
638
+ "<|LOC_660|>": 100957,
639
+ "<|LOC_661|>": 100958,
640
+ "<|LOC_662|>": 100959,
641
+ "<|LOC_663|>": 100960,
642
+ "<|LOC_664|>": 100961,
643
+ "<|LOC_665|>": 100962,
644
+ "<|LOC_666|>": 100963,
645
+ "<|LOC_667|>": 100964,
646
+ "<|LOC_668|>": 100965,
647
+ "<|LOC_669|>": 100966,
648
+ "<|LOC_66|>": 100363,
649
+ "<|LOC_670|>": 100967,
650
+ "<|LOC_671|>": 100968,
651
+ "<|LOC_672|>": 100969,
652
+ "<|LOC_673|>": 100970,
653
+ "<|LOC_674|>": 100971,
654
+ "<|LOC_675|>": 100972,
655
+ "<|LOC_676|>": 100973,
656
+ "<|LOC_677|>": 100974,
657
+ "<|LOC_678|>": 100975,
658
+ "<|LOC_679|>": 100976,
659
+ "<|LOC_67|>": 100364,
660
+ "<|LOC_680|>": 100977,
661
+ "<|LOC_681|>": 100978,
662
+ "<|LOC_682|>": 100979,
663
+ "<|LOC_683|>": 100980,
664
+ "<|LOC_684|>": 100981,
665
+ "<|LOC_685|>": 100982,
666
+ "<|LOC_686|>": 100983,
667
+ "<|LOC_687|>": 100984,
668
+ "<|LOC_688|>": 100985,
669
+ "<|LOC_689|>": 100986,
670
+ "<|LOC_68|>": 100365,
671
+ "<|LOC_690|>": 100987,
672
+ "<|LOC_691|>": 100988,
673
+ "<|LOC_692|>": 100989,
674
+ "<|LOC_693|>": 100990,
675
+ "<|LOC_694|>": 100991,
676
+ "<|LOC_695|>": 100992,
677
+ "<|LOC_696|>": 100993,
678
+ "<|LOC_697|>": 100994,
679
+ "<|LOC_698|>": 100995,
680
+ "<|LOC_699|>": 100996,
681
+ "<|LOC_69|>": 100366,
682
+ "<|LOC_6|>": 100303,
683
+ "<|LOC_700|>": 100997,
684
+ "<|LOC_701|>": 100998,
685
+ "<|LOC_702|>": 100999,
686
+ "<|LOC_703|>": 101000,
687
+ "<|LOC_704|>": 101001,
688
+ "<|LOC_705|>": 101002,
689
+ "<|LOC_706|>": 101003,
690
+ "<|LOC_707|>": 101004,
691
+ "<|LOC_708|>": 101005,
692
+ "<|LOC_709|>": 101006,
693
+ "<|LOC_70|>": 100367,
694
+ "<|LOC_710|>": 101007,
695
+ "<|LOC_711|>": 101008,
696
+ "<|LOC_712|>": 101009,
697
+ "<|LOC_713|>": 101010,
698
+ "<|LOC_714|>": 101011,
699
+ "<|LOC_715|>": 101012,
700
+ "<|LOC_716|>": 101013,
701
+ "<|LOC_717|>": 101014,
702
+ "<|LOC_718|>": 101015,
703
+ "<|LOC_719|>": 101016,
704
+ "<|LOC_71|>": 100368,
705
+ "<|LOC_720|>": 101017,
706
+ "<|LOC_721|>": 101018,
707
+ "<|LOC_722|>": 101019,
708
+ "<|LOC_723|>": 101020,
709
+ "<|LOC_724|>": 101021,
710
+ "<|LOC_725|>": 101022,
711
+ "<|LOC_726|>": 101023,
712
+ "<|LOC_727|>": 101024,
713
+ "<|LOC_728|>": 101025,
714
+ "<|LOC_729|>": 101026,
715
+ "<|LOC_72|>": 100369,
716
+ "<|LOC_730|>": 101027,
717
+ "<|LOC_731|>": 101028,
718
+ "<|LOC_732|>": 101029,
719
+ "<|LOC_733|>": 101030,
720
+ "<|LOC_734|>": 101031,
721
+ "<|LOC_735|>": 101032,
722
+ "<|LOC_736|>": 101033,
723
+ "<|LOC_737|>": 101034,
724
+ "<|LOC_738|>": 101035,
725
+ "<|LOC_739|>": 101036,
726
+ "<|LOC_73|>": 100370,
727
+ "<|LOC_740|>": 101037,
728
+ "<|LOC_741|>": 101038,
729
+ "<|LOC_742|>": 101039,
730
+ "<|LOC_743|>": 101040,
731
+ "<|LOC_744|>": 101041,
732
+ "<|LOC_745|>": 101042,
733
+ "<|LOC_746|>": 101043,
734
+ "<|LOC_747|>": 101044,
735
+ "<|LOC_748|>": 101045,
736
+ "<|LOC_749|>": 101046,
737
+ "<|LOC_74|>": 100371,
738
+ "<|LOC_750|>": 101047,
739
+ "<|LOC_751|>": 101048,
740
+ "<|LOC_752|>": 101049,
741
+ "<|LOC_753|>": 101050,
742
+ "<|LOC_754|>": 101051,
743
+ "<|LOC_755|>": 101052,
744
+ "<|LOC_756|>": 101053,
745
+ "<|LOC_757|>": 101054,
746
+ "<|LOC_758|>": 101055,
747
+ "<|LOC_759|>": 101056,
748
+ "<|LOC_75|>": 100372,
749
+ "<|LOC_760|>": 101057,
750
+ "<|LOC_761|>": 101058,
751
+ "<|LOC_762|>": 101059,
752
+ "<|LOC_763|>": 101060,
753
+ "<|LOC_764|>": 101061,
754
+ "<|LOC_765|>": 101062,
755
+ "<|LOC_766|>": 101063,
756
+ "<|LOC_767|>": 101064,
757
+ "<|LOC_768|>": 101065,
758
+ "<|LOC_769|>": 101066,
759
+ "<|LOC_76|>": 100373,
760
+ "<|LOC_770|>": 101067,
761
+ "<|LOC_771|>": 101068,
762
+ "<|LOC_772|>": 101069,
763
+ "<|LOC_773|>": 101070,
764
+ "<|LOC_774|>": 101071,
765
+ "<|LOC_775|>": 101072,
766
+ "<|LOC_776|>": 101073,
767
+ "<|LOC_777|>": 101074,
768
+ "<|LOC_778|>": 101075,
769
+ "<|LOC_779|>": 101076,
770
+ "<|LOC_77|>": 100374,
771
+ "<|LOC_780|>": 101077,
772
+ "<|LOC_781|>": 101078,
773
+ "<|LOC_782|>": 101079,
774
+ "<|LOC_783|>": 101080,
775
+ "<|LOC_784|>": 101081,
776
+ "<|LOC_785|>": 101082,
777
+ "<|LOC_786|>": 101083,
778
+ "<|LOC_787|>": 101084,
779
+ "<|LOC_788|>": 101085,
780
+ "<|LOC_789|>": 101086,
781
+ "<|LOC_78|>": 100375,
782
+ "<|LOC_790|>": 101087,
783
+ "<|LOC_791|>": 101088,
784
+ "<|LOC_792|>": 101089,
785
+ "<|LOC_793|>": 101090,
786
+ "<|LOC_794|>": 101091,
787
+ "<|LOC_795|>": 101092,
788
+ "<|LOC_796|>": 101093,
789
+ "<|LOC_797|>": 101094,
790
+ "<|LOC_798|>": 101095,
791
+ "<|LOC_799|>": 101096,
792
+ "<|LOC_79|>": 100376,
793
+ "<|LOC_7|>": 100304,
794
+ "<|LOC_800|>": 101097,
795
+ "<|LOC_801|>": 101098,
796
+ "<|LOC_802|>": 101099,
797
+ "<|LOC_803|>": 101100,
798
+ "<|LOC_804|>": 101101,
799
+ "<|LOC_805|>": 101102,
800
+ "<|LOC_806|>": 101103,
801
+ "<|LOC_807|>": 101104,
802
+ "<|LOC_808|>": 101105,
803
+ "<|LOC_809|>": 101106,
804
+ "<|LOC_80|>": 100377,
805
+ "<|LOC_810|>": 101107,
806
+ "<|LOC_811|>": 101108,
807
+ "<|LOC_812|>": 101109,
808
+ "<|LOC_813|>": 101110,
809
+ "<|LOC_814|>": 101111,
810
+ "<|LOC_815|>": 101112,
811
+ "<|LOC_816|>": 101113,
812
+ "<|LOC_817|>": 101114,
813
+ "<|LOC_818|>": 101115,
814
+ "<|LOC_819|>": 101116,
815
+ "<|LOC_81|>": 100378,
816
+ "<|LOC_820|>": 101117,
817
+ "<|LOC_821|>": 101118,
818
+ "<|LOC_822|>": 101119,
819
+ "<|LOC_823|>": 101120,
820
+ "<|LOC_824|>": 101121,
821
+ "<|LOC_825|>": 101122,
822
+ "<|LOC_826|>": 101123,
823
+ "<|LOC_827|>": 101124,
824
+ "<|LOC_828|>": 101125,
825
+ "<|LOC_829|>": 101126,
826
+ "<|LOC_82|>": 100379,
827
+ "<|LOC_830|>": 101127,
828
+ "<|LOC_831|>": 101128,
829
+ "<|LOC_832|>": 101129,
830
+ "<|LOC_833|>": 101130,
831
+ "<|LOC_834|>": 101131,
832
+ "<|LOC_835|>": 101132,
833
+ "<|LOC_836|>": 101133,
834
+ "<|LOC_837|>": 101134,
835
+ "<|LOC_838|>": 101135,
836
+ "<|LOC_839|>": 101136,
837
+ "<|LOC_83|>": 100380,
838
+ "<|LOC_840|>": 101137,
839
+ "<|LOC_841|>": 101138,
840
+ "<|LOC_842|>": 101139,
841
+ "<|LOC_843|>": 101140,
842
+ "<|LOC_844|>": 101141,
843
+ "<|LOC_845|>": 101142,
844
+ "<|LOC_846|>": 101143,
845
+ "<|LOC_847|>": 101144,
846
+ "<|LOC_848|>": 101145,
847
+ "<|LOC_849|>": 101146,
848
+ "<|LOC_84|>": 100381,
849
+ "<|LOC_850|>": 101147,
850
+ "<|LOC_851|>": 101148,
851
+ "<|LOC_852|>": 101149,
852
+ "<|LOC_853|>": 101150,
853
+ "<|LOC_854|>": 101151,
854
+ "<|LOC_855|>": 101152,
855
+ "<|LOC_856|>": 101153,
856
+ "<|LOC_857|>": 101154,
857
+ "<|LOC_858|>": 101155,
858
+ "<|LOC_859|>": 101156,
859
+ "<|LOC_85|>": 100382,
860
+ "<|LOC_860|>": 101157,
861
+ "<|LOC_861|>": 101158,
862
+ "<|LOC_862|>": 101159,
863
+ "<|LOC_863|>": 101160,
864
+ "<|LOC_864|>": 101161,
865
+ "<|LOC_865|>": 101162,
866
+ "<|LOC_866|>": 101163,
867
+ "<|LOC_867|>": 101164,
868
+ "<|LOC_868|>": 101165,
869
+ "<|LOC_869|>": 101166,
870
+ "<|LOC_86|>": 100383,
871
+ "<|LOC_870|>": 101167,
872
+ "<|LOC_871|>": 101168,
873
+ "<|LOC_872|>": 101169,
874
+ "<|LOC_873|>": 101170,
875
+ "<|LOC_874|>": 101171,
876
+ "<|LOC_875|>": 101172,
877
+ "<|LOC_876|>": 101173,
878
+ "<|LOC_877|>": 101174,
879
+ "<|LOC_878|>": 101175,
880
+ "<|LOC_879|>": 101176,
881
+ "<|LOC_87|>": 100384,
882
+ "<|LOC_880|>": 101177,
883
+ "<|LOC_881|>": 101178,
884
+ "<|LOC_882|>": 101179,
885
+ "<|LOC_883|>": 101180,
886
+ "<|LOC_884|>": 101181,
887
+ "<|LOC_885|>": 101182,
888
+ "<|LOC_886|>": 101183,
889
+ "<|LOC_887|>": 101184,
890
+ "<|LOC_888|>": 101185,
891
+ "<|LOC_889|>": 101186,
892
+ "<|LOC_88|>": 100385,
893
+ "<|LOC_890|>": 101187,
894
+ "<|LOC_891|>": 101188,
895
+ "<|LOC_892|>": 101189,
896
+ "<|LOC_893|>": 101190,
897
+ "<|LOC_894|>": 101191,
898
+ "<|LOC_895|>": 101192,
899
+ "<|LOC_896|>": 101193,
900
+ "<|LOC_897|>": 101194,
901
+ "<|LOC_898|>": 101195,
902
+ "<|LOC_899|>": 101196,
903
+ "<|LOC_89|>": 100386,
904
+ "<|LOC_8|>": 100305,
905
+ "<|LOC_900|>": 101197,
906
+ "<|LOC_901|>": 101198,
907
+ "<|LOC_902|>": 101199,
908
+ "<|LOC_903|>": 101200,
909
+ "<|LOC_904|>": 101201,
910
+ "<|LOC_905|>": 101202,
911
+ "<|LOC_906|>": 101203,
912
+ "<|LOC_907|>": 101204,
913
+ "<|LOC_908|>": 101205,
914
+ "<|LOC_909|>": 101206,
915
+ "<|LOC_90|>": 100387,
916
+ "<|LOC_910|>": 101207,
917
+ "<|LOC_911|>": 101208,
918
+ "<|LOC_912|>": 101209,
919
+ "<|LOC_913|>": 101210,
920
+ "<|LOC_914|>": 101211,
921
+ "<|LOC_915|>": 101212,
922
+ "<|LOC_916|>": 101213,
923
+ "<|LOC_917|>": 101214,
924
+ "<|LOC_918|>": 101215,
925
+ "<|LOC_919|>": 101216,
926
+ "<|LOC_91|>": 100388,
927
+ "<|LOC_920|>": 101217,
928
+ "<|LOC_921|>": 101218,
929
+ "<|LOC_922|>": 101219,
930
+ "<|LOC_923|>": 101220,
931
+ "<|LOC_924|>": 101221,
932
+ "<|LOC_925|>": 101222,
933
+ "<|LOC_926|>": 101223,
934
+ "<|LOC_927|>": 101224,
935
+ "<|LOC_928|>": 101225,
936
+ "<|LOC_929|>": 101226,
937
+ "<|LOC_92|>": 100389,
938
+ "<|LOC_930|>": 101227,
939
+ "<|LOC_931|>": 101228,
940
+ "<|LOC_932|>": 101229,
941
+ "<|LOC_933|>": 101230,
942
+ "<|LOC_934|>": 101231,
943
+ "<|LOC_935|>": 101232,
944
+ "<|LOC_936|>": 101233,
945
+ "<|LOC_937|>": 101234,
946
+ "<|LOC_938|>": 101235,
947
+ "<|LOC_939|>": 101236,
948
+ "<|LOC_93|>": 100390,
949
+ "<|LOC_940|>": 101237,
950
+ "<|LOC_941|>": 101238,
951
+ "<|LOC_942|>": 101239,
952
+ "<|LOC_943|>": 101240,
953
+ "<|LOC_944|>": 101241,
954
+ "<|LOC_945|>": 101242,
955
+ "<|LOC_946|>": 101243,
956
+ "<|LOC_947|>": 101244,
957
+ "<|LOC_948|>": 101245,
958
+ "<|LOC_949|>": 101246,
959
+ "<|LOC_94|>": 100391,
960
+ "<|LOC_950|>": 101247,
961
+ "<|LOC_951|>": 101248,
962
+ "<|LOC_952|>": 101249,
963
+ "<|LOC_953|>": 101250,
964
+ "<|LOC_954|>": 101251,
965
+ "<|LOC_955|>": 101252,
966
+ "<|LOC_956|>": 101253,
967
+ "<|LOC_957|>": 101254,
968
+ "<|LOC_958|>": 101255,
969
+ "<|LOC_959|>": 101256,
970
+ "<|LOC_95|>": 100392,
971
+ "<|LOC_960|>": 101257,
972
+ "<|LOC_961|>": 101258,
973
+ "<|LOC_962|>": 101259,
974
+ "<|LOC_963|>": 101260,
975
+ "<|LOC_964|>": 101261,
976
+ "<|LOC_965|>": 101262,
977
+ "<|LOC_966|>": 101263,
978
+ "<|LOC_967|>": 101264,
979
+ "<|LOC_968|>": 101265,
980
+ "<|LOC_969|>": 101266,
981
+ "<|LOC_96|>": 100393,
982
+ "<|LOC_970|>": 101267,
983
+ "<|LOC_971|>": 101268,
984
+ "<|LOC_972|>": 101269,
985
+ "<|LOC_973|>": 101270,
986
+ "<|LOC_974|>": 101271,
987
+ "<|LOC_975|>": 101272,
988
+ "<|LOC_976|>": 101273,
989
+ "<|LOC_977|>": 101274,
990
+ "<|LOC_978|>": 101275,
991
+ "<|LOC_979|>": 101276,
992
+ "<|LOC_97|>": 100394,
993
+ "<|LOC_980|>": 101277,
994
+ "<|LOC_981|>": 101278,
995
+ "<|LOC_982|>": 101279,
996
+ "<|LOC_983|>": 101280,
997
+ "<|LOC_984|>": 101281,
998
+ "<|LOC_985|>": 101282,
999
+ "<|LOC_986|>": 101283,
1000
+ "<|LOC_987|>": 101284,
1001
+ "<|LOC_988|>": 101285,
1002
+ "<|LOC_989|>": 101286,
1003
+ "<|LOC_98|>": 100395,
1004
+ "<|LOC_990|>": 101287,
1005
+ "<|LOC_991|>": 101288,
1006
+ "<|LOC_992|>": 101289,
1007
+ "<|LOC_993|>": 101290,
1008
+ "<|LOC_994|>": 101291,
1009
+ "<|LOC_995|>": 101292,
1010
+ "<|LOC_996|>": 101293,
1011
+ "<|LOC_997|>": 101294,
1012
+ "<|LOC_998|>": 101295,
1013
+ "<|LOC_999|>": 101296,
1014
+ "<|LOC_99|>": 100396,
1015
+ "<|LOC_9|>": 100306,
1016
+ "<|LOC_BEGIN|>": 101298,
1017
+ "<|LOC_END|>": 101299,
1018
+ "<|LOC_SEP|>": 101300,
1019
+ "<|image_pad|>": 101304,
1020
+ "<|video_pad|>": 101307
1021
+ }
chat_template.jinja ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#- PaddleOCR-VL chat template.
    Renders: cls_token, then per message
      user      -> "User: " + one image_token per image part + all text parts + newline
      assistant -> "Assistant: " + all text parts + eos_token
      system    -> each text part followed by a newline
    and finally "Assistant: " when add_generation_prompt is true.
    A message "content" may be a list of typed parts or a plain string; a plain
    string is rendered as a single text part.
    Every tag uses whitespace control, so the indentation below never reaches the output. -#}
{%- if not add_generation_prompt is defined -%}
    {%- set add_generation_prompt = true -%}
{%- endif -%}
{%- if not cls_token is defined -%}
    {%- set cls_token = "<|begin_of_sentence|>" -%}
{%- endif -%}
{%- if not eos_token is defined -%}
    {%- set eos_token = "</s>" -%}
{%- endif -%}
{%- if not image_token is defined -%}
    {%- set image_token = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" -%}
{%- endif -%}
{{- cls_token -}}
{%- for message in messages -%}
    {%- if message["role"] == "user" -%}
        {{- "User: " -}}
        {%- if message["content"] is string -%}
            {#- Plain-string content: there are no image parts, emit the text as is. -#}
            {{ message["content"] }}
        {%- else -%}
            {#- Image placeholders are emitted before any text, whatever their order in the list. -#}
            {%- for content in message["content"] -%}
                {%- if content["type"] == "image" -%}
                    {{ image_token }}
                {%- endif -%}
            {%- endfor -%}
            {%- for content in message["content"] -%}
                {%- if content["type"] == "text" -%}
                    {{ content["text"] }}
                {%- endif -%}
            {%- endfor -%}
        {%- endif -%}
        {{ "\n" -}}
    {%- elif message["role"] == "assistant" -%}
        {{- "Assistant: " -}}
        {%- if message["content"] is string -%}
            {{ message["content"] }}
        {%- else -%}
            {%- for content in message["content"] -%}
                {%- if content["type"] == "text" -%}
                    {{ content["text"] }}
                {%- endif -%}
            {%- endfor -%}
        {%- endif -%}
        {{ eos_token -}}
    {%- elif message["role"] == "system" -%}
        {%- if message["content"] is string -%}
            {{ message["content"] + "\n" }}
        {%- else -%}
            {%- for content in message["content"] -%}
                {%- if content["type"] == "text" -%}
                    {{ content["text"] + "\n" }}
                {%- endif -%}
            {%- endfor -%}
        {%- endif -%}
    {%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{- "Assistant: " -}}
{%- endif -%}
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PaddleOCRVLForConditionalGeneration"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
8
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration",
9
+ "AutoModelForCausalLM": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration"
10
+ },
11
+ "compression_ratio": 1.0,
12
+ "head_dim": 128,
13
+ "hidden_act": "silu",
14
+ "hidden_dropout_prob": 0.0,
15
+ "hidden_size": 1024,
16
+ "ignored_index": -100,
17
+ "image_token_id": 100295,
18
+ "intermediate_size": 3072,
19
+ "max_position_embeddings": 131072,
20
+ "max_sequence_length": null,
21
+ "model_type": "paddleocr_vl",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 18,
24
+ "num_key_value_heads": 2,
25
+ "pad_token_id": 0,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "mrope_section": [
29
+ 16,
30
+ 24,
31
+ 24
32
+ ],
33
+ "rope_type": "default",
34
+ "type": "default"
35
+ },
36
+ "rope_theta": 500000,
37
+ "sliding_window": null,
38
+ "tie_word_embeddings": false,
39
+ "torch_dtype": "bfloat16",
40
+ "transformers_version": "4.55.0",
41
+ "use_bias": false,
42
+ "use_cache": false,
43
+ "use_flash_attention": false,
44
+ "video_token_id": 101307,
45
+ "vision_config": {
46
+ "architectures": [
47
+ "SiglipVisionModel"
48
+ ],
49
+ "attention_dropout": 0.0,
50
+ "auto_map": {
51
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
52
+ "AutoModel": "modeling_paddleocr_vl.SiglipVisionModel"
53
+ },
54
+ "hidden_act": "gelu_pytorch_tanh",
55
+ "hidden_size": 1152,
56
+ "image_size": 384,
57
+ "intermediate_size": 4304,
58
+ "layer_norm_eps": 1e-06,
59
+ "model_type": "paddleocr_vl",
60
+ "num_attention_heads": 16,
61
+ "num_channels": 3,
62
+ "num_hidden_layers": 27,
63
+ "pad_token_id": 0,
64
+ "patch_size": 14,
65
+ "spatial_merge_size": 2,
66
+ "temporal_patch_size": 2,
67
+ "tokens_per_second": 2,
68
+ "torch_dtype": "bfloat16"
69
+ },
70
+ "vision_start_token_id": 101305,
71
+ "vocab_size": 103424,
72
+ "weight_share_add_bias": true,
73
+ "use_3d_rope": true,
74
+ "rope_is_neox_style": true
75
+ }
configuration_paddleocr_vl.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+
18
class PaddleOCRVisionConfig(PretrainedConfig):
    """
    Configuration of the PaddleOCR-VL vision encoder.

    The class only stores the values it is given as attributes; nothing is validated
    here. It is registered under the `vision_config` key of the parent configuration
    through `base_config_key`.

    Args (descriptions follow the usual Hugging Face meaning of each name):
        hidden_size (int): Width of the encoder hidden states.
        intermediate_size (int): Width of the feed-forward layer.
        num_hidden_layers (int): Number of encoder layers.
        num_attention_heads (int): Number of attention heads per layer.
        num_channels (int): Number of input image channels.
        image_size (int): Nominal input image resolution.
        patch_size (int): Side length of one square image patch.
        hidden_act (str): Name of the activation function.
        layer_norm_eps (float): Epsilon used by the layer-norm layers.
        attention_dropout (float): Dropout probability applied to attention weights.
        spatial_merge_size (int): Stored as-is; presumably the number of patches merged
            along each spatial axis before the language model (confirm against the model code).
        temporal_patch_size (int): Stored as-is; presumably the number of frames grouped
            into one temporal patch (confirm against the model code).
        tokens_per_second (int): Stored as-is; its use is not visible in this file.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "paddleocr_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=2,
        **kwargs,
    ):
        # Generic options (dtype, architectures, ...) are handled by the base class.
        super().__init__(**kwargs)

        # Transformer geometry.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input image layout.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Regularisation, normalisation and activation.
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Patch grouping parameters shared with the image processor / language model.
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
54
+
55
+
56
+
57
class PaddleOCRVLConfig(PretrainedConfig):
    """
    Configuration class for PaddleOCR-VL.

    Stores the hyper-parameters of the language model together with a nested
    `PaddleOCRVisionConfig` (attribute `vision_config`) for the vision encoder.
    It inherits from `PretrainedConfig`, which handles the generic options that
    arrive through `**kwargs`.
    """

    model_type = "paddleocr_vl"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"vision_config": PaddleOCRVisionConfig}

    # Default tensor-parallel plan for the decoder layers of the base model.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Default pipeline-parallel plan for the base model.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        image_token_id=101304,
        video_token_id=101305,
        vision_start_token_id=101306,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        head_dim=128,
        hidden_act="silu",
        use_bias=False,
        rope_theta=10000,
        weight_share_add_bias=True,
        ignored_index=-100,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        max_sequence_length=None,
        tie_word_embeddings=False,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        """
        Initialize configuration with default or specified parameters.

        Args:
            vocab_size (int): Size of the vocabulary (number of unique tokens)
            hidden_size (int): Dimensionality of the hidden states
            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
            max_position_embeddings (int): Maximum sequence length the model can handle
            num_hidden_layers (int): Number of hidden layers in the Transformer
            num_attention_heads (int): Number of attention heads for each attention layer
            image_token_id (int): Token ID stored as `image_token_id` (presumably the image
                placeholder token; confirm against the tokenizer)
            video_token_id (int): Token ID stored as `video_token_id` (presumably the video
                placeholder token; confirm against the tokenizer)
            vision_start_token_id (int): Token ID stored as `vision_start_token_id`
            rms_norm_eps (float): The epsilon used by the RMS normalization layers
            use_cache (bool): Whether to use caching for faster generation (decoding)
            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
            pad_token_id (int): Token ID used for padding sequences
            bos_token_id (int): Token ID used for beginning-of-sequence
            eos_token_id (int): Token ID used for end-of-sequence
            head_dim (int): Dimensionality of each attention head
            hidden_act (str): Name of the activation function
            use_bias (bool): Whether to use bias terms in linear layers
            rope_theta (float): The base period of the RoPE embeddings
            weight_share_add_bias (bool): Whether to share bias weights in certain layers
            ignored_index (int): Target value that is ignored during loss computation
            attention_probs_dropout_prob (float): Dropout probability for attention weights
            hidden_dropout_prob (float): Dropout probability for hidden layers
            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
            num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
            max_sequence_length (int): Maximum sequence length for positional embeddings
            tie_word_embeddings (bool): Whether input and output embeddings share weights
            vision_config (dict or PaddleOCRVisionConfig, optional): Vision encoder settings.
                A dict is expanded into a `PaddleOCRVisionConfig`, `None` creates one with
                default values, and any other object is stored unchanged.
            rope_scaling (dict, optional): RoPE settings. A `"type"` of `"mrope"` is rewritten
                to `"default"` and mirrored into `"rope_type"` before validation. Note that
                the dict passed in is modified in place.
            **kwargs: Additional keyword arguments passed to parent class
        """
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object: keep it, so `vision_config` is always set.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.use_flash_attention = use_flash_attention
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.sliding_window = None
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.ignored_index = ignored_index
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.max_sequence_length = max_sequence_length

        self.rope_scaling = rope_scaling
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            # Checkpoints may declare "mrope"; the validator only knows "default".
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        # Call the base initializer exactly once, with every special-token id.
        # A second call without them would reset pad/bos/eos_token_id to None.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.55.0",
6
+ "use_cache": false
7
+ }
image_processing.py ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Image processor class for PaddleOCR-VL."""
16
+
17
+ import math
18
+ from typing import Dict, List, Optional, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
23
+ from torchvision.transforms import functional as TF
24
+ from transformers.image_transforms import (
25
+ convert_to_rgb,
26
+ resize,
27
+ to_channel_dimension_format,
28
+ )
29
+ from transformers.image_utils import (
30
+ OPENAI_CLIP_MEAN,
31
+ OPENAI_CLIP_STD,
32
+ ChannelDimension,
33
+ PILImageResampling,
34
+ get_image_size,
35
+ infer_channel_dimension_format,
36
+ is_scaled_image,
37
+ is_valid_image,
38
+ make_list_of_images,
39
+ to_numpy_array,
40
+ valid_images,
41
+ validate_preprocess_arguments,
42
+ )
43
+ from transformers.utils import TensorType, is_vision_available, logging
44
+
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
+ if is_vision_available():
50
+ from PIL import Image
51
+
52
# Accepted image inputs: a single PIL image / numpy array / torch tensor, or a flat
# list of them. PIL and torch types are written as strings so they are not resolved
# when this module is imported.
ImageInput = Union[
    "PIL.Image.Image",
    np.ndarray,
    "torch.Tensor",
    List["PIL.Image.Image"],
    List[np.ndarray],
    List["torch.Tensor"],
]  # noqa


# Accepted video inputs: one array/tensor, a list of frames, or a list of frame lists.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "torch.Tensor",
    List["np.ndarray"],
    List["torch.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["torch.Tensor"]],
]  # noqa
72
+
73
+
74
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize the supported image containers into one flat list of images.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a list/tuple of images, or a list/tuple of lists of images.

    Returns:
        list: A nested input is flattened in order, a flat list/tuple is returned
        unchanged, and a single image is wrapped in a one-element list.

    Raises:
        ValueError: If the input matches none of the supported layouts.
    """
    if isinstance(images, (list, tuple)):
        first = images[0]
        if isinstance(first, (list, tuple)) and is_valid_image(first[0]):
            # Nested input: concatenate the inner lists, preserving order.
            flattened = []
            for group in images:
                flattened.extend(group)
            return flattened
        if is_valid_image(first):
            return images

    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
99
+
100
+
101
def adjust_size(size, patch_size):
    """
    Round `size` down to a multiple of `patch_size` that holds an even number of patches.

    Args:
        size: Length in pixels.
        patch_size: Side length of one patch in pixels.

    Returns:
        The largest value not exceeding `size` that equals an even patch count
        times `patch_size`.
    """
    patch_count = size // patch_size
    # An odd patch count loses one patch so that the count becomes even.
    even_patch_count = patch_count - (patch_count % 2)
    return even_patch_count * patch_size
106
+
107
+
108
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize the supported video containers into a list of videos (lists of frames).

    Handled layouts:
        - list/tuple of lists/tuples of frames: returned unchanged;
        - list/tuple of PIL images: treated as one video and wrapped in a list;
        - list/tuple of objects with a 4-D `shape`: each one is expanded into a list of frames;
        - a single object with a 4-D `shape`: expanded into one list of frames.

    Raises:
        ValueError: If the input matches none of the layouts above.
    """
    is_sequence = isinstance(videos, (list, tuple))

    if is_sequence and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    if is_sequence and is_valid_image(videos[0]):
        first = videos[0]
        if isinstance(first, Image.Image):
            return [videos]
        if len(first.shape) == 4:
            return [list(clip) for clip in videos]
        # Any other valid image type falls through to the error below.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
126
+
127
+
128
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Compute a target `(height, width)` so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels']
       whenever that is possible with both sides at least 'factor'.

    3. The aspect ratio of the image is maintained as closely as possible.

    A side shorter than `factor` is first scaled up to `factor` (the other side is
    scaled by the same ratio) and a message is printed.

    Args:
        height: Original image height in pixels; must be positive.
        width: Original image width in pixels; must be positive.
        factor: Value both returned dimensions must be a multiple of.
        min_pixels: Lower bound for `height * width` of the result.
        max_pixels: Upper bound for `height * width` of the result.

    Returns:
        Tuple `(h_bar, w_bar)` with the target height and width.

    Raises:
        ValueError: If a dimension is not positive, or if the aspect ratio
            (after the small-side adjustment) exceeds 200.
    """
    if height <= 0 or width <= 0:
        # Without this check a zero side surfaces as an opaque ZeroDivisionError below.
        raise ValueError(
            f"height and width must be positive, got height={height}, width={width}"
        )

    if height < factor:
        print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor

    if width < factor:
        print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor

    if max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )

    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can reach 0 for elongated images with a small max_pixels;
        # keep at least one `factor` per side so the result stays a valid size.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
174
+
175
+
176
+ class SiglipImageProcessor(BaseImageProcessor):
177
+ r"""
178
+ Constructs a Siglip image processor that dynamically resizes images based on the original images.
179
+
180
+ Args:
181
+ do_resize (`bool`, *optional*, defaults to `True`):
182
+ Whether to resize the image's (height, width) dimensions.
183
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
184
+ Resampling filter to use when resizing the image.
185
+ do_rescale (`bool`, *optional*, defaults to `True`):
186
+ Whether to rescale the image by the specified scale `rescale_factor`.
187
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
188
+ Scale factor to use if rescaling the image.
189
+ do_normalize (`bool`, *optional*, defaults to `True`):
190
+ Whether to normalize the image.
191
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
192
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
193
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
194
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
195
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
196
+ Whether to convert the image to RGB.
197
+ min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
198
+ The min pixels of the image to resize the image.
199
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
200
+ The max pixels of the image to resize the image.
201
+ patch_size (`int`, *optional*, defaults to 14):
202
+ The spatial patch size of the vision encoder.
203
+ temporal_patch_size (`int`, *optional*, defaults to 1):
204
+ The temporal patch size of the vision encoder.
205
+ merge_size (`int`, *optional*, defaults to 2):
206
+ The merge size of the vision encoder to llm encoder.
207
+ """
208
+
209
+ model_input_names = [
210
+ "pixel_values",
211
+ "image_grid_thw",
212
+ "pixel_values_videos",
213
+ "video_grid_thw",
214
+ ]
215
+
216
def __init__(
    self,
    do_resize: bool = True,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = True,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
    patch_size: int = 14,
    temporal_patch_size: int = 1,
    merge_size: int = 2,
    **kwargs,
) -> None:
    """
    Store the preprocessing options on the processor.

    `image_mean` and `image_std` fall back to `OPENAI_CLIP_MEAN` and `OPENAI_CLIP_STD`
    when left as `None`. Every other argument is stored unchanged under an attribute
    of the same name. `**kwargs` is forwarded to the base-class initializer.
    """
    super().__init__(**kwargs)

    # Resolve the normalization statistics before storing them.
    if image_mean is None:
        image_mean = OPENAI_CLIP_MEAN
    if image_std is None:
        image_std = OPENAI_CLIP_STD

    # Resize / rescale / normalize switches and their parameters.
    self.do_resize = do_resize
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    self.image_mean = image_mean
    self.image_std = image_std

    # Pixel budget and patch geometry.
    self.min_pixels = min_pixels
    self.max_pixels = max_pixels
    self.patch_size = patch_size
    self.temporal_patch_size = temporal_patch_size
    self.merge_size = merge_size
    self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
    self.do_convert_rgb = do_convert_rgb
248
+
249
def mvit_rescale(self, image: Image.Image, merge_size: int = 2) -> Image.Image:
    """
    Shrink, then pad or crop a PIL image so its sides fit the patch grid.

    Steps:
        1. If the patch count `(w // patch_size) * (h // patch_size)` exceeds
           `self.in_token_limit`, downscale the image (bicubic) to fit that limit.
        2. If `self.pad_input` is truthy, pad each side up to a multiple of
           `merge_size * patch_size`. Otherwise center-crop each side down to an
           even number of patches.
        3. If either side still spans 512 or more patches, center-crop it to at
           most 510 patches.

    NOTE(review): `self.in_token_limit` and `self.pad_input` are not assigned by the
    `__init__` of this class; presumably they arrive through `**kwargs` or the
    processor config. Confirm before calling this method.

    Args:
        image: Input PIL image.
        merge_size: Number of patches merged per axis; only used for the padding size.

    Returns:
        The rescaled and padded or cropped image.

    Raises:
        ValueError: If `image.size` cannot be read as a `(width, height)` pair.
    """
    try:
        w, h = image.size
    except Exception as exc:
        # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate; the original error is chained for debugging.
        raise ValueError(str((type(image), image))) from exc
    patch_size = self.patch_size

    if (w // patch_size) * (h // patch_size) > self.in_token_limit:
        scale = math.sqrt(
            self.in_token_limit / ((w // patch_size) * (h // patch_size))
        )
        new_w, new_h = int(w * scale), int(h * scale)

        image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
    if self.pad_input:
        new_w, new_h = image.size
        pad_size_h = merge_size * patch_size
        pad_size_w = merge_size * patch_size

        # Amount needed to reach the next multiple (0 when already aligned).
        pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
        pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w

        image = TF.pad(image, (0, 0, pad_w, pad_h))
    else:
        new_w, new_h = image.size
        new_w = new_w - new_w % patch_size
        new_h = new_h - new_h % patch_size

        new_w = adjust_size(new_w, patch_size)
        new_h = adjust_size(new_h, patch_size)

        image = TF.center_crop(image, (new_h, new_w))

    w, h = image.size
    if w // patch_size >= 512 or h // patch_size >= 512:
        new_h = min(patch_size * 510, h)
        new_w = min(patch_size * 510, w)
        image = TF.center_crop(image, (new_h, new_w))
        # raise ValueError("Exceed pos emb")
    return image
289
+
290
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    do_resize: bool = None,
    resample: PILImageResampling = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Preprocess one image (or the frames of one video) and cut the result into patches.

    The options are used exactly as passed: this method applies no fallback to the
    processor attributes, so a flag left as `None` disables the corresponding step.
    Callers are expected to pass resolved values.

    The target size is computed from the FIRST image only and applied to every image
    in `images`, so that all processed images can be stacked into one array.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*):
            Whether to resize the image to the size returned by `smart_resize`.
        resample (`PILImageResampling`, *optional*):
            Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
        do_convert_rgb (`bool`, *optional*):
            Whether to convert the image to RGB.
        data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. Inferred from the first image when unset. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

    Returns:
        A tuple `(flatten_patches, (grid_t, grid_h, grid_w))`, where `flatten_patches` is an
        array of shape `(grid_t * grid_h * grid_w, channel, patch_size, patch_size)`.
    """
    images = make_list_of_images(images)

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        logger.warning_once(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    # The first image defines the size used for every image in the batch.
    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = height, width
    processed_images = []

    for image in images:
        if do_resize:
            resized_height, resized_width = smart_resize(
                height,
                width,
                factor=self.patch_size * self.merge_size,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )
            image = resize(
                image,
                size=(resized_height, resized_width),
                resample=resample,
                input_data_format=input_data_format,
            )

        if do_rescale:
            image = self.rescale(
                image, scale=rescale_factor, input_data_format=input_data_format
            )

        if do_normalize:
            image = self.normalize(
                image=image,
                mean=image_mean,
                std=image_std,
                input_data_format=input_data_format,
            )
        image = to_channel_dimension_format(
            image, data_format, input_channel_dim=input_data_format
        )
        processed_images.append(image)

    patches = np.array(processed_images)
    if data_format == ChannelDimension.LAST:
        # The patch reshape below expects (frames, channel, height, width).
        patches = patches.transpose(0, 3, 1, 2)
    if patches.shape[0] == 1:
        # A single image is repeated to fill one temporal patch.
        patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
    channel = patches.shape[1]
    grid_t = patches.shape[0] // self.temporal_patch_size
    grid_h, grid_w = (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
    # Split height and width into (grid, patch) pairs ...
    patches = patches.reshape(
        grid_t,
        self.temporal_patch_size,
        channel,
        grid_h,
        self.patch_size,
        grid_w,
        self.patch_size,
    )
    # ... then bring the grid axes to the front:
    # (grid_t, grid_h, grid_w, channel, temporal, patch, patch).
    patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
    # The final reshape folds the temporal axis into the channel axis, which is
    # only shape-preserving for a temporal patch size of 1.
    assert self.temporal_patch_size == 1
    flatten_patches = patches.reshape(
        grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
    )
    return flatten_patches, (grid_t, grid_h, grid_w)
420
+
421
+ def preprocess(
422
+ self,
423
+ images: ImageInput,
424
+ videos: VideoInput = None,
425
+ do_resize: bool = None,
426
+ size: Dict[str, int] = None,
427
+ resample: PILImageResampling = None,
428
+ do_rescale: bool = None,
429
+ rescale_factor: float = None,
430
+ do_normalize: bool = None,
431
+ image_mean: Optional[Union[float, List[float]]] = None,
432
+ image_std: Optional[Union[float, List[float]]] = None,
433
+ do_convert_rgb: bool = None,
434
+ return_tensors: Optional[Union[str, TensorType]] = None,
435
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
436
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
437
+ ):
438
+ """
439
+ Args:
440
+ images (`ImageInput`):
441
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
442
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
443
+ videos (`VideoInput`):
444
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
445
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
446
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
447
+ Whether to resize the image.
448
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
449
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
450
+ the longest edge resized to keep the input aspect ratio.
451
+ resample (`int`, *optional*, defaults to `self.resample`):
452
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
453
+ has an effect if `do_resize` is set to `True`.
454
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
455
+ Whether to rescale the image.
456
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
457
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
458
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
459
+ Whether to normalize the image.
460
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
461
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
462
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
463
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
464
+ `True`.
465
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
466
+ Whether to convert the image to RGB.
467
+ return_tensors (`str` or `TensorType`, *optional*):
468
+ The type of tensors to return. Can be one of:
469
+ - Unset: Return a list of `np.ndarray`.
470
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
471
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
472
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
473
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
474
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
475
+ The channel dimension format for the output image. Can be one of:
476
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
477
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
478
+ - Unset: Use the channel dimension format of the input image.
479
+ input_data_format (`ChannelDimension` or `str`, *optional*):
480
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
481
+ from the input image. Can be one of:
482
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
483
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
484
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
485
+
486
+ """
487
+ do_resize = do_resize if do_resize is not None else self.do_resize
488
+ size = size if size is not None else self.size
489
+ resample = resample if resample is not None else self.resample
490
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
491
+ rescale_factor = (
492
+ rescale_factor if rescale_factor is not None else self.rescale_factor
493
+ )
494
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
495
+ image_mean = image_mean if image_mean is not None else self.image_mean
496
+ image_std = image_std if image_std is not None else self.image_std
497
+ do_convert_rgb = (
498
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
499
+ )
500
+
501
+ if images is not None:
502
+ images = make_batched_images(images)
503
+ if videos is not None:
504
+ videos = make_batched_videos(videos)
505
+
506
+ if images is not None and not valid_images(images):
507
+ raise ValueError(
508
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
509
+ "torch.Tensor, tf.Tensor or jax.ndarray."
510
+ )
511
+
512
+ validate_preprocess_arguments(
513
+ rescale_factor=rescale_factor,
514
+ do_normalize=do_normalize,
515
+ image_mean=image_mean,
516
+ image_std=image_std,
517
+ do_resize=do_resize,
518
+ size=size,
519
+ resample=resample,
520
+ )
521
+
522
+ if images is not None:
523
+ pixel_values, vision_grid_thws = [], []
524
+ for image in images:
525
+ patches, image_grid_thw = self._preprocess(
526
+ image,
527
+ do_resize=do_resize,
528
+ resample=resample,
529
+ do_rescale=do_rescale,
530
+ rescale_factor=rescale_factor,
531
+ do_normalize=do_normalize,
532
+ image_mean=image_mean,
533
+ image_std=image_std,
534
+ data_format=data_format,
535
+ do_convert_rgb=do_convert_rgb,
536
+ input_data_format=input_data_format,
537
+ )
538
+ pixel_values.extend(patches)
539
+ vision_grid_thws.append(image_grid_thw)
540
+ pixel_values = np.array(pixel_values)
541
+ vision_grid_thws = np.array(vision_grid_thws)
542
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
543
+
544
+ if videos is not None:
545
+ pixel_values, vision_grid_thws = [], []
546
+ for images in videos:
547
+ patches, video_grid_thw = self._preprocess(
548
+ images,
549
+ do_resize=do_resize,
550
+ resample=resample,
551
+ do_rescale=do_rescale,
552
+ rescale_factor=rescale_factor,
553
+ do_normalize=do_normalize,
554
+ image_mean=image_mean,
555
+ image_std=image_std,
556
+ data_format=data_format,
557
+ do_convert_rgb=do_convert_rgb,
558
+ input_data_format=input_data_format,
559
+ )
560
+ pixel_values.extend(patches)
561
+ vision_grid_thws.append(video_grid_thw)
562
+ pixel_values = np.array(pixel_values)
563
+ vision_grid_thws = np.array(vision_grid_thws)
564
+ data = {
565
+ "pixel_values_videos": pixel_values,
566
+ "video_grid_thw": vision_grid_thws,
567
+ }
568
+
569
+ return BatchFeature(data=data, tensor_type=return_tensors)
inference.yml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Global:
2
+ model_name: PaddleOCR-VL-0.9B
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3085f1042e184f68f8a412aa0f64f2c4b8562989598bbfba326aaa11fc685de8
3
+ size 1917255968
modeling_paddleocr_vl.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing.SiglipImageProcessor",
4
+ "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "SiglipImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_pixels": 2822400,
22
+ "merge_size": 2,
23
+ "min_pixels": 147384,
24
+ "patch_size": 14,
25
+ "processor_class": "PaddleOCRVLProcessor",
26
+ "resample": 3,
27
+ "rescale_factor": 0.00392156862745098,
28
+ "size": {
29
+ "max_pixels": 2822400,
30
+ "min_pixels": 147384
31
+ },
32
+ "temporal_patch_size": 1
33
+ }
processing_paddleocr_vl.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Union
16
+ import numpy as np
17
+ import torch
18
+ from transformers.feature_extraction_utils import BatchFeature
19
+ from transformers.processing_utils import (
20
+ ProcessingKwargs,
21
+ ProcessorMixin,
22
+ Unpack,
23
+ VideosKwargs,
24
+ )
25
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
26
+
27
+
28
+ ImageInput = Union[
29
+ "PIL.Image.Image",
30
+ np.ndarray,
31
+ "torch.Tensor",
32
+ List["PIL.Image.Image"],
33
+ List[np.ndarray],
34
+ List["torch.Tensor"],
35
+ ] # noqa
36
+
37
+
38
+ VideoInput = Union[
39
+ List["PIL.Image.Image"],
40
+ "np.ndarray",
41
+ "torch.Tensor",
42
+ List["np.ndarray"],
43
+ List["torch.Tensor"],
44
+ List[List["PIL.Image.Image"]],
45
+ List[List["np.ndarrray"]],
46
+ List[List["torch.Tensor"]],
47
+ ] # noqa
48
+
49
+
50
+ class PaddleOCRVLVideosProcessorKwargs(VideosKwargs, total=False):
51
+ fps: Union[List[float], float]
52
+
53
+
54
+ class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
55
+ videos_kwargs: PaddleOCRVLVideosProcessorKwargs
56
+ _defaults = {
57
+ "text_kwargs": {
58
+ "padding": False,
59
+ },
60
+ "videos_kwargs": {"fps": 2.0},
61
+ }
62
+
63
+
64
+ class PaddleOCRVLProcessor(ProcessorMixin):
65
+ r"""
66
+ [`PaddleOCRVLProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
67
+ [`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.
68
+ Args:
69
+ image_processor ([`SiglipImageProcessor`], *optional*):
70
+ The image processor is a required input.
71
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
72
+ The tokenizer is a required input.
73
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
74
+ in a chat into a tokenizable string.
75
+ """
76
+
77
+ attributes = ["image_processor", "tokenizer"]
78
+ valid_kwargs = [
79
+ "chat_template",
80
+ "image_std",
81
+ "min_pixels",
82
+ "image_mean",
83
+ "merge_size",
84
+ "image_processor_type",
85
+ "temporal_patch_size",
86
+ "patch_size",
87
+ "max_pixels",
88
+ ]
89
+
90
+ image_processor_class = "AutoImageProcessor"
91
+ tokenizer_class = "AutoTokenizer"
92
+
93
+ def __init__(
94
+ self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
95
+ ):
96
+ self.image_token = (
97
+ "<|IMAGE_PLACEHOLDER|>"
98
+ if not hasattr(tokenizer, "image_token")
99
+ else tokenizer.image_token
100
+ )
101
+ self.video_token = (
102
+ "<|video_pad|>"
103
+ if not hasattr(tokenizer, "video_token")
104
+ else tokenizer.video_token
105
+ )
106
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
107
+
108
+ def __call__(
109
+ self,
110
+ images: ImageInput = None,
111
+ text: Union[
112
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
113
+ ] = None,
114
+ videos: VideoInput = None,
115
+ **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
116
+ ) -> BatchFeature:
117
+ """
118
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
119
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
120
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
121
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `vision_infos` is not `None`.
122
+
123
+ Args:
124
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
125
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
126
+ tensor. Both channels-first and channels-last formats are supported.
127
+ text (`str`, `List[str]`, `List[List[str]]`):
128
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
129
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
130
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
131
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
132
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
133
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
134
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
135
+ If set, will return tensors of a particular framework. Acceptable values are:
136
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
137
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
138
+ - `'np'`: Return NumPy `np.ndarray` objects.
139
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
140
+
141
+ Returns:
142
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
143
+
144
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
145
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
146
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
147
+ `None`).
148
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
149
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
150
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
151
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
152
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
153
+ """
154
+ output_kwargs = self._merge_kwargs(
155
+ PaddleOCRVLProcessorKwargs,
156
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
157
+ **kwargs,
158
+ )
159
+
160
+ if images is not None:
161
+ image_inputs = self.image_processor(images=images, return_tensors="pt")
162
+ image_inputs["pixel_values"] = image_inputs["pixel_values"]
163
+ image_grid_thw = image_inputs["image_grid_thw"]
164
+
165
+ else:
166
+ image_inputs = {}
167
+ image_grid_thw = None
168
+
169
+ if videos is not None:
170
+ # TODO: add video processing
171
+ videos_inputs = self.image_processor(
172
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
173
+ )
174
+ video_grid_thw = videos_inputs["video_grid_thw"]
175
+
176
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
177
+ if isinstance(fps, (int, float)):
178
+ second_per_grid_ts = [
179
+ self.image_processor.temporal_patch_size / fps
180
+ ] * len(video_grid_thw)
181
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
182
+ second_per_grid_ts = [
183
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
184
+ ]
185
+ else:
186
+ raise ValueError(
187
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
188
+ )
189
+ videos_inputs.update(
190
+ {"second_per_grid_ts": torch.tensor(second_per_grid_ts)}
191
+ )
192
+
193
+ else:
194
+ videos_inputs = {}
195
+ video_grid_thw = None
196
+
197
+ if not isinstance(text, list):
198
+ text = [text]
199
+
200
+ if image_grid_thw is not None:
201
+ index = 0
202
+ for i in range(len(text)):
203
+ while self.image_token in text[i]:
204
+ text[i] = text[i].replace(
205
+ self.image_token,
206
+ "<|placeholder|>"
207
+ * (
208
+ image_grid_thw[index].prod()
209
+ // self.image_processor.merge_size
210
+ // self.image_processor.merge_size
211
+ ),
212
+ 1,
213
+ )
214
+ index += 1
215
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
216
+
217
+ if video_grid_thw is not None:
218
+ index = 0
219
+ for i in range(len(text)):
220
+ while self.video_token in text[i]:
221
+ text[i] = text[i].replace(
222
+ self.video_token,
223
+ "<|placeholder|>"
224
+ * (
225
+ video_grid_thw[index].prod()
226
+ // self.image_processor.merge_size
227
+ // self.image_processor.merge_size
228
+ ),
229
+ 1,
230
+ )
231
+ index += 1
232
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
233
+
234
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
235
+
236
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
237
+
238
+ def batch_decode(self, *args, **kwargs):
239
+ """
240
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
241
+ refer to the docstring of this method for more information.
242
+ """
243
+ return self.tokenizer.batch_decode(*args, **kwargs)
244
+
245
+ def decode(self, *args, **kwargs):
246
+ """
247
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
248
+ the docstring of this method for more information.
249
+ """
250
+ return self.tokenizer.decode(*args, **kwargs)
251
+
252
+ def post_process_image_text_to_text(
253
+ self,
254
+ generated_outputs,
255
+ skip_special_tokens=True,
256
+ clean_up_tokenization_spaces=False,
257
+ **kwargs,
258
+ ):
259
+ """
260
+ Post-process the output of the model to decode the text.
261
+
262
+ Args:
263
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
264
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
265
+ or `(sequence_length,)`.
266
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
267
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
268
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
269
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
270
+ **kwargs:
271
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
272
+
273
+ Returns:
274
+ `List[str]`: The decoded text.
275
+ """
276
+ return self.tokenizer.batch_decode(
277
+ generated_outputs,
278
+ skip_special_tokens=skip_special_tokens,
279
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
280
+ **kwargs,
281
+ )
282
+
283
+ @property
284
+ def model_input_names(self):
285
+ tokenizer_input_names = self.tokenizer.model_input_names
286
+ image_processor_input_names = self.image_processor.model_input_names
287
+ names_from_processor = list(
288
+ dict.fromkeys(tokenizer_input_names + image_processor_input_names)
289
+ )
290
+ return names_from_processor + ["second_per_grid_ts"]
291
+
292
+
293
+ __all__ = ["PaddleOCRVLProcessor", "PaddleOCRVLProcessor"]
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
4
+ },
5
+ "processor_class": "PaddleOCRVLProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|IMAGE_PLACEHOLDER|>",
4
+ "<|image_pad|>",
5
+ "<|IMAGE_START|>",
6
+ "<|IMAGE_END|>",
7
+ "<|video_pad|>"
8
+ ],
9
+ "bos_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "cls_token": {
17
+ "content": "<|begin_of_sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "eos_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "mask_token": {
31
+ "content": "<mask:1>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "pad_token": {
38
+ "content": "<unk>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "sep_token": {
45
+ "content": "<|end_of_sentence|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ "unk_token": {
52
+ "content": "<unk>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false
57
+ }
58
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f90f04fd8e5eb6dfa380f37d10c87392de8438dccb6768a2486b5a96ee76dba6
3
+ size 11187679
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ef7db83df785924fb83d7b887b6e822a031c56e15cff40aaf9b982988180df
3
+ size 1614363
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff