fantos

Tingquan committed 8 days ago

Commit

b1902a6

verified ·

0 Parent(s):

Duplicate from PaddlePaddle/PaddleOCR-VL

Browse files

Co-authored-by: Tingquan Gao <[email protected]>

Files changed (23) hide show

.gitattributes +37 -0
LICENSE +201 -0
PP-DocLayoutV2/config.json +176 -0
PP-DocLayoutV2/inference.pdiparams +3 -0
PP-DocLayoutV2/inference.pdmodel +3 -0
PP-DocLayoutV2/inference.yml +100 -0
README.md +349 -0
added_tokens.json +1021 -0
chat_template.jinja +46 -0
config.json +75 -0
configuration_paddleocr_vl.py +191 -0
generation_config.json +7 -0
image_processing.py +569 -0
inference.yml +2 -0
model.safetensors +3 -0
modeling_paddleocr_vl.py +0 -0
preprocessor_config.json +33 -0
processing_paddleocr_vl.py +293 -0
processor_config.json +6 -0
special_tokens_map.json +58 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+PP-DocLayoutV2/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
+PP-DocLayoutV2/inference.pdmodel filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

PP-DocLayoutV2/config.json ADDED Viewed

	@@ -0,0 +1,176 @@

+{
+    "mode": "paddle",
+    "draw_threshold": 0.5,
+    "metric": "COCO",
+    "use_dynamic_shape": false,
+    "Global": {
+        "model_name": "PP-DocLayoutV2"
+    },
+    "arch": "DETR",
+    "min_subgraph_size": 3,
+    "Preprocess": [
+        {
+            "interp": 2,
+            "keep_ratio": false,
+            "target_size": [
+                800,
+                800
+            ],
+            "type": "Resize"
+        },
+        {
+            "mean": [
+                0.0,
+                0.0,
+                0.0
+            ],
+            "norm_type": "none",
+            "std": [
+                1.0,
+                1.0,
+                1.0
+            ],
+            "type": "NormalizeImage"
+        },
+        {
+            "type": "Permute"
+        }
+    ],
+    "label_list": [
+        "abstract",
+        "algorithm",
+        "aside_text",
+        "chart",
+        "content",
+        "display_formula",
+        "doc_title",
+        "figure_title",
+        "footer",
+        "footer_image",
+        "footnote",
+        "formula_number",
+        "header",
+        "header_image",
+        "image",
+        "inline_formula",
+        "number",
+        "paragraph_title",
+        "reference",
+        "reference_content",
+        "seal",
+        "table",
+        "text",
+        "vertical_text",
+        "vision_footnote"
+    ],
+    "Hpi": {
+        "backend_configs": {
+            "paddle_infer": {
+                "trt_dynamic_shapes": {
+                    "image": [
+                        [
+                            1,
+                            3,
+                            800,
+                            800
+                        ],
+                        [
+                            1,
+                            3,
+                            800,
+                            800
+                        ],
+                        [
+                            8,
+                            3,
+                            800,
+                            800
+                        ]
+                    ],
+                    "scale_factor": [
+                        [
+                            1,
+                            2
+                        ],
+                        [
+                            1,
+                            2
+                        ],
+                        [
+                            8,
+                            2
+                        ]
+                    ]
+                },
+                "trt_dynamic_shape_input_data": {
+                    "scale_factor": [
+                        [
+                            2,
+                            2
+                        ],
+                        [
+                            1,
+                            1
+                        ],
+                        [
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67,
+                            0.67
+                        ]
+                    ]
+                }
+            },
+            "tensorrt": {
+                "dynamic_shapes": {
+                    "image": [
+                        [
+                            1,
+                            3,
+                            800,
+                            800
+                        ],
+                        [
+                            1,
+                            3,
+                            800,
+                            800
+                        ],
+                        [
+                            8,
+                            3,
+                            800,
+                            800
+                        ]
+                    ],
+                    "scale_factor": [
+                        [
+                            1,
+                            2
+                        ],
+                        [
+                            1,
+                            2
+                        ],
+                        [
+                            8,
+                            2
+                        ]
+                    ]
+                }
+            }
+        }
+    }
+}

PP-DocLayoutV2/inference.pdiparams ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45404a84c9fdf91d7bbc94bd47ac4c03649bda84167de04c62bff4726657869a
+size 212170944

PP-DocLayoutV2/inference.pdmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fddd4b4359b95e6f1dd86c86f05e9516517bead9089287681838cdcbf003563b
+size 1515181

PP-DocLayoutV2/inference.yml ADDED Viewed

	@@ -0,0 +1,100 @@

+mode: paddle
+draw_threshold: 0.5
+metric: COCO
+use_dynamic_shape: false
+Global:
+  model_name: PP-DocLayoutV2
+arch: DETR
+min_subgraph_size: 3
+Preprocess:
+- interp: 2
+  keep_ratio: false
+  target_size:
+  - 800
+  - 800
+  type: Resize
+- mean:
+  - 0.0
+  - 0.0
+  - 0.0
+  norm_type: none
+  std:
+  - 1.0
+  - 1.0
+  - 1.0
+  type: NormalizeImage
+- type: Permute
+label_list:
+- abstract
+- algorithm
+- aside_text
+- chart
+- content
+- display_formula
+- doc_title
+- figure_title
+- footer
+- footer_image
+- footnote
+- formula_number
+- header
+- header_image
+- image
+- inline_formula
+- number
+- paragraph_title
+- reference
+- reference_content
+- seal
+- table
+- text
+- vertical_text
+- vision_footnote
+Hpi:
+  backend_configs:
+    paddle_infer:
+      trt_dynamic_shapes: &id001
+        image:
+        - - 1
+          - 3
+          - 800
+          - 800
+        - - 1
+          - 3
+          - 800
+          - 800
+        - - 8
+          - 3
+          - 800
+          - 800
+        scale_factor:
+        - - 1
+          - 2
+        - - 1
+          - 2
+        - - 8
+          - 2
+      trt_dynamic_shape_input_data:
+        scale_factor:
+        - - 2
+          - 2
+        - - 1
+          - 1
+        - - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+          - 0.67
+    tensorrt:
+      dynamic_shapes: *id001

README.md ADDED Viewed

	@@ -0,0 +1,349 @@

+---
+license: apache-2.0
+pipeline_tag: image-text-to-text
+tags:
+- ERNIE4.5
+- PaddleOCR
+- PaddlePaddle
+- image-to-text
+- ocr
+- document-parse
+- layout
+- table
+- formula
+- chart
+base_model: baidu/ERNIE-4.5-0.3B-Paddle
+language:
+- en
+- zh
+- multilingual
+library_name: PaddleOCR
+---
+<div align="center">
+<h1 align="center">
+PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model
+</h1>
+[![repo](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
+[![HuggingFace](https://img.shields.io/badge/HuggingFace-black.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValzjI7Wg9Ehp
trkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/PaddlePaddle/PaddleOCR-VL)
+[![ModelScope](https://img.shields.io/badge/ModelScope-black?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://modelscope.cn/models/PaddlePaddle/PaddleOCR-VL)
+[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-black.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValzj
I7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/PaddlePaddle/PaddleOCR-VL_Online_Demo)
+[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-black?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary)
+[![Discord](https://img.shields.io/badge/Discord-ERNIE-5865F2?logo=discord&logoColor=white)](https://discord.gg/JPmZXDsEEK)
+[![X](https://img.shields.io/badge/X-PaddlePaddle-6080F0)](https://x.com/PaddlePaddle)
+[![License](https://img.shields.io/badge/license-Apache_2.0-green)](./LICENSE)
+**🔥 Official Demo**: [Baidu AI Studio](https://aistudio.baidu.com/application/detail/98365) |
+**📝 arXiv**: [Technical Report](https://arxiv.org/pdf/2510.14528)
+</div>
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/allmetric.png" width="800"/>
+</div>
+## Introduction
+**PaddleOCR-VL** is a SOTA and resource-efficient model tailored for document parsing. Its core component is PaddleOCR-VL-0.9B, a compact yet powerful vision-language model (VLM) that integrates a NaViT-style dynamic resolution visual encoder with the ERNIE-4.5-0.3B language model to enable accurate element recognition. This innovative model efficiently supports 109 languages and excels in recognizing complex elements (e.g., text, tables, formulas, and charts), while maintaining minimal resource consumption. Through comprehensive evaluations on widely used public benchmarks and in-house benchmarks, PaddleOCR-VL achieves SOTA performance in both page-level document parsing and element-level recognition. It significantly outperforms existing solutions, exhibits strong competitiveness against top-tier VLMs, and delivers fast inference speeds. These strengths make it highly suitable for practical deployment in real-world scenarios.
+### **Core Features**
+1. **Compact yet Powerful VLM Architecture:** We present a novel vision-language model that is specifically designed for resource-efficient inference, achieving outstanding performance in element recognition. By integrating a NaViT-style dynamic high-resolution visual encoder with the lightweight ERNIE-4.5-0.3B language model, we significantly enhance the model’s recognition capabilities and decoding efficiency. This integration maintains high accuracy while reducing computational demands, making it well-suited for efficient and practical document processing applications.
+2. **SOTA Performance on Document Parsing:** PaddleOCR-VL achieves state-of-the-art performance in both page-level document parsing and element-level recognition. It significantly outperforms existing pipeline-based solutions and exhibits strong competitiveness against leading vision-language models (VLMs) in document parsing. Moreover, it excels in recognizing complex document elements, such as text, tables, formulas, and charts, making it suitable for a wide range of challenging content types, including handwritten text and historical documents. This makes it highly versatile and suitable for a wide range of document types and scenarios.
+3. **Multilingual Support:** PaddleOCR-VL supports 109 languages, covering major global languages, including but not limited to Chinese, English, Japanese, Latin, and Korean, as well as languages with different scripts and structures, such as Russian (Cyrillic script), Arabic, Hindi (Devanagari script), and Thai. This broad language coverage substantially enhances the applicability of our system to multilingual and globalized document processing scenarios.
+### **Model Architecture**
+<!-- PaddleOCR-VL decomposes the complex task of document parsing into two stages. The first stage, PP-DocLayoutV2, is responsible for layout analysis, where it localizes semantic regions and predicts their reading order. Subsequently, the second stage, PaddleOCR-VL-0.9B, leverages these layout predictions to perform fine-grained recognition of diverse content, including text, tables, formulas, and charts. Finally, a lightweight post-processing module aggregates the outputs from both stages and formats the final document into structured Markdown and JSON. -->
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/paddleocrvl.png" width="800"/>
+</div>
+## News
+* ```2025.10.16``` 🚀 We release [PaddleOCR-VL](https://github.com/PaddlePaddle/PaddleOCR) — multilingual document parsing via a 0.9B Ultra-Compact Vision-Language Model with SOTA performance.
+* ```2025.10.29``` Supports calling the core module PaddleOCR-VL-0.9B of PaddleOCR-VL via the `transformers` library.
+## Usage
+### Install Dependencies
+Install [PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick) and [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR):
+```bash
+python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+python -m pip install -U "paddleocr[doc-parser]"
+python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
+```
+> For Windows users, please use WSL or a Docker container.
+### Basic Usage
+CLI usage:
+```bash
+paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png
+```
+Python API usage:
+```python
+from paddleocr import PaddleOCRVL
+pipeline = PaddleOCRVL()
+output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png")
+for res in output:
+    res.print()
+    res.save_to_json(save_path="output")
+    res.save_to_markdown(save_path="output")
+```
+### Accelerate VLM Inference via Optimized Inference Servers
+1. Start the VLM inference server (the default port is `8080`):
+    ```bash
+    docker run \
+        --rm \
+        --gpus all \
+        --network host \
+        ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlex-genai-vllm-server
+    ```
+2. Call the PaddleOCR CLI or Python API:
+    ```bash
+    paddleocr doc_parser \
+        -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png \
+        --vl_rec_backend vllm-server \
+        --vl_rec_server_url http://127.0.0.1:8080/v1
+    ```
+    ```python
+    from paddleocr import PaddleOCRVL
+    pipeline = PaddleOCRVL(vl_rec_backend="vllm-server", vl_rec_server_url="http://127.0.0.1:8080/v1")
+    output = pipeline.predict("https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png")
+    for res in output:
+        res.print()
+        res.save_to_json(save_path="output")
+        res.save_to_markdown(save_path="output")
+    ```
+**For more usage details and parameter explanations, see the [documentation](https://www.paddleocr.ai/latest/en/version3.x/pipeline_usage/PaddleOCR-VL.html).**
+## PaddleOCR-VL-0.9B Usage with transformers
+Currently, we support inference using the PaddleOCR-VL-0.9B model with the `transformers` library, which can recognize texts, formulas, tables, and chart elements. In the future, we plan to support full document parsing inference with `transformers`. Below is a simple script we provide to support inference using the PaddleOCR-VL-0.9B model with `transformers`.
+> [!NOTE]
+> Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
+```python
+from PIL import Image
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+CHOSEN_TASK = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "formula": "Formula Recognition:",
+    "chart": "Chart Recognition:",
+}
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+image = Image.open(image_path).convert("RGB")
+model = AutoModelForCausalLM.from_pretrained(
+    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
+).to(DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+messages = [
+    {"role": "user",
+     "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": PROMPTS[CHOSEN_TASK]},
+        ]
+    }
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(DEVICE)
+outputs = model.generate(**inputs, max_new_tokens=1024)
+outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+print(outputs)
+```
+## Performance
+### Page-Level Document Parsing
+#### 1. OmniDocBench v1.5
+##### PaddleOCR-VL achieves SOTA performance for overall, text, formula, tables and reading order on OmniDocBench v1.5
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni15.png" width="800"/>
+</div>
+####  2. OmniDocBench v1.0
+##### PaddleOCR-VL achieves SOTA performance for almost all metrics of overall, text, formula, tables and reading order on OmniDocBench v1.0
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni10.png" width="800"/>
+</div>
+> **Notes:**
+> - The metrics are from [MinerU](https://github.com/opendatalab/MinerU), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
+### Element-level Recognition
+#### 1. Text
+**Comparison of OmniDocBench-OCR-block Performance**
+PaddleOCR-VL demonstrates robust and versatile capability in handling diverse document types, establishing it as the leading method in the OmniDocBench-OCR-block performance evaluation.
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omnibenchocr.png" width="800"/>
+</div>
+**Comparison of In-house-OCR Performance**
+In-house-OCR provides an evaluation of performance across multiple languages and text types. Our model demonstrates outstanding accuracy with the lowest edit distances in all evaluated scripts.
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouseocr.png" width="800"/>
+</div>
+#### 2. Table
+**Comparison of In-house-Table Performance**
+Our self-built evaluation set contains diverse types of table images, such as Chinese, English, and mixed Chinese-English tables, and tables with various characteristics like full, partial, or no borders, book/manual formats, lists, academic papers, and merged cells, as well as low-quality and watermarked images. PaddleOCR-VL achieves remarkable performance across all categories.
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousetable.png" width="600"/>
+</div>
+#### 3. Formula
+**Comparison of In-house-Formula Performance**
+The In-house-Formula evaluation set contains simple prints, complex prints, camera scans, and handwritten formulas. PaddleOCR-VL demonstrates the best performance in every category.
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouse-formula.png" width="500"/>
+</div>
+#### 4. Chart
+**Comparison of In-house-Chart Performance**
+The evaluation set is broadly categorized into 11 chart categories, including bar-line hybrid, pie, 100% stacked bar, area, bar, bubble, histogram, line, scatterplot, stacked area, and stacked bar. PaddleOCR-VL not only outperforms expert OCR VLMs but also surpasses some 72B-level multimodal language models.
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousechart.png" width="400"/>
+</div>
+## Visualization
+### Comprehensive Document Parsing
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview1.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview2.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview3.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview4.jpg" width="600"/>
+</div>
+### Text
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_english_arabic.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_handwriting_02.jpg" width="300" style="display: inline-block;"/>
+</div>
+### Table
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_01.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_02.jpg" width="300" style="display: inline-block;"/>
+</div>
+### Formula
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_EN.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_ZH.jpg" width="300" style="display: inline-block;"/>
+</div>
+### Chart
+<div align="center">
+  <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_01.jpg" width="300" style="display: inline-block;"/>
+  <img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_02.jpg" width="300" style="display: inline-block;"/>
+</div>
+## Acknowledgments
+We would like to thank [ERNIE](https://github.com/PaddlePaddle/ERNIE), [Keye](https://github.com/Kwai-Keye/Keye), [MinerU](https://github.com/opendatalab/MinerU), and [OmniDocBench](https://github.com/opendatalab/OmniDocBench) for providing valuable code, model weights and benchmarks. We also appreciate everyone's contribution to this open-source project!
+## Citation
+If you find PaddleOCR-VL helpful, feel free to give us a star and citation.
+```bibtex
+@misc{cui2025paddleocrvlboostingmultilingualdocument,
+      title={PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model},
+      author={Cheng Cui and Ting Sun and Suyin Liang and Tingquan Gao and Zelun Zhang and Jiaxuan Liu and Xueqing Wang and Changda Zhou and Hongen Liu and Manhui Lin and Yue Zhang and Yubo Zhang and Handong Zheng and Jing Zhang and Jun Zhang and Yi Liu and Dianhai Yu and Yanjun Ma},
+      year={2025},
+      eprint={2510.14528},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2510.14528},
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,1021 @@

+{
+  "<ecel>": 101308,
+  "<fcel>": 101309,
+  "<lcel>": 101311,
+  "<nl>": 101313,
+  "<ucel>": 101312,
+  "<xcel>": 101310,
+  "<|AUDIO_PLACEHOLDER|>": 100296,
+  "<|CROP_COL_SEP|>": 101301,
+  "<|CROP_ROW_SEP|>": 101302,
+  "<|IMAGE_END|>": 101306,
+  "<|IMAGE_PLACEHOLDER|>": 100295,
+  "<|IMAGE_SEP|>": 101303,
+  "<|IMAGE_START|>": 101305,
+  "<|LOC_0|>": 100297,
+  "<|LOC_1000|>": 101297,
+  "<|LOC_100|>": 100397,
+  "<|LOC_101|>": 100398,
+  "<|LOC_102|>": 100399,
+  "<|LOC_103|>": 100400,
+  "<|LOC_104|>": 100401,
+  "<|LOC_105|>": 100402,
+  "<|LOC_106|>": 100403,
+  "<|LOC_107|>": 100404,
+  "<|LOC_108|>": 100405,
+  "<|LOC_109|>": 100406,
+  "<|LOC_10|>": 100307,
+  "<|LOC_110|>": 100407,
+  "<|LOC_111|>": 100408,
+  "<|LOC_112|>": 100409,
+  "<|LOC_113|>": 100410,
+  "<|LOC_114|>": 100411,
+  "<|LOC_115|>": 100412,
+  "<|LOC_116|>": 100413,
+  "<|LOC_117|>": 100414,
+  "<|LOC_118|>": 100415,
+  "<|LOC_119|>": 100416,
+  "<|LOC_11|>": 100308,
+  "<|LOC_120|>": 100417,
+  "<|LOC_121|>": 100418,
+  "<|LOC_122|>": 100419,
+  "<|LOC_123|>": 100420,
+  "<|LOC_124|>": 100421,
+  "<|LOC_125|>": 100422,
+  "<|LOC_126|>": 100423,
+  "<|LOC_127|>": 100424,
+  "<|LOC_128|>": 100425,
+  "<|LOC_129|>": 100426,
+  "<|LOC_12|>": 100309,
+  "<|LOC_130|>": 100427,
+  "<|LOC_131|>": 100428,
+  "<|LOC_132|>": 100429,
+  "<|LOC_133|>": 100430,
+  "<|LOC_134|>": 100431,
+  "<|LOC_135|>": 100432,
+  "<|LOC_136|>": 100433,
+  "<|LOC_137|>": 100434,
+  "<|LOC_138|>": 100435,
+  "<|LOC_139|>": 100436,
+  "<|LOC_13|>": 100310,
+  "<|LOC_140|>": 100437,
+  "<|LOC_141|>": 100438,
+  "<|LOC_142|>": 100439,
+  "<|LOC_143|>": 100440,
+  "<|LOC_144|>": 100441,
+  "<|LOC_145|>": 100442,
+  "<|LOC_146|>": 100443,
+  "<|LOC_147|>": 100444,
+  "<|LOC_148|>": 100445,
+  "<|LOC_149|>": 100446,
+  "<|LOC_14|>": 100311,
+  "<|LOC_150|>": 100447,
+  "<|LOC_151|>": 100448,
+  "<|LOC_152|>": 100449,
+  "<|LOC_153|>": 100450,
+  "<|LOC_154|>": 100451,
+  "<|LOC_155|>": 100452,
+  "<|LOC_156|>": 100453,
+  "<|LOC_157|>": 100454,
+  "<|LOC_158|>": 100455,
+  "<|LOC_159|>": 100456,
+  "<|LOC_15|>": 100312,
+  "<|LOC_160|>": 100457,
+  "<|LOC_161|>": 100458,
+  "<|LOC_162|>": 100459,
+  "<|LOC_163|>": 100460,
+  "<|LOC_164|>": 100461,
+  "<|LOC_165|>": 100462,
+  "<|LOC_166|>": 100463,
+  "<|LOC_167|>": 100464,
+  "<|LOC_168|>": 100465,
+  "<|LOC_169|>": 100466,
+  "<|LOC_16|>": 100313,
+  "<|LOC_170|>": 100467,
+  "<|LOC_171|>": 100468,
+  "<|LOC_172|>": 100469,
+  "<|LOC_173|>": 100470,
+  "<|LOC_174|>": 100471,
+  "<|LOC_175|>": 100472,
+  "<|LOC_176|>": 100473,
+  "<|LOC_177|>": 100474,
+  "<|LOC_178|>": 100475,
+  "<|LOC_179|>": 100476,
+  "<|LOC_17|>": 100314,
+  "<|LOC_180|>": 100477,
+  "<|LOC_181|>": 100478,
+  "<|LOC_182|>": 100479,
+  "<|LOC_183|>": 100480,
+  "<|LOC_184|>": 100481,
+  "<|LOC_185|>": 100482,
+  "<|LOC_186|>": 100483,
+  "<|LOC_187|>": 100484,
+  "<|LOC_188|>": 100485,
+  "<|LOC_189|>": 100486,
+  "<|LOC_18|>": 100315,
+  "<|LOC_190|>": 100487,
+  "<|LOC_191|>": 100488,
+  "<|LOC_192|>": 100489,
+  "<|LOC_193|>": 100490,
+  "<|LOC_194|>": 100491,
+  "<|LOC_195|>": 100492,
+  "<|LOC_196|>": 100493,
+  "<|LOC_197|>": 100494,
+  "<|LOC_198|>": 100495,
+  "<|LOC_199|>": 100496,
+  "<|LOC_19|>": 100316,
+  "<|LOC_1|>": 100298,
+  "<|LOC_200|>": 100497,
+  "<|LOC_201|>": 100498,
+  "<|LOC_202|>": 100499,
+  "<|LOC_203|>": 100500,
+  "<|LOC_204|>": 100501,
+  "<|LOC_205|>": 100502,
+  "<|LOC_206|>": 100503,
+  "<|LOC_207|>": 100504,
+  "<|LOC_208|>": 100505,
+  "<|LOC_209|>": 100506,
+  "<|LOC_20|>": 100317,
+  "<|LOC_210|>": 100507,
+  "<|LOC_211|>": 100508,
+  "<|LOC_212|>": 100509,
+  "<|LOC_213|>": 100510,
+  "<|LOC_214|>": 100511,
+  "<|LOC_215|>": 100512,
+  "<|LOC_216|>": 100513,
+  "<|LOC_217|>": 100514,
+  "<|LOC_218|>": 100515,
+  "<|LOC_219|>": 100516,
+  "<|LOC_21|>": 100318,
+  "<|LOC_220|>": 100517,
+  "<|LOC_221|>": 100518,
+  "<|LOC_222|>": 100519,
+  "<|LOC_223|>": 100520,
+  "<|LOC_224|>": 100521,
+  "<|LOC_225|>": 100522,
+  "<|LOC_226|>": 100523,
+  "<|LOC_227|>": 100524,
+  "<|LOC_228|>": 100525,
+  "<|LOC_229|>": 100526,
+  "<|LOC_22|>": 100319,
+  "<|LOC_230|>": 100527,
+  "<|LOC_231|>": 100528,
+  "<|LOC_232|>": 100529,
+  "<|LOC_233|>": 100530,
+  "<|LOC_234|>": 100531,
+  "<|LOC_235|>": 100532,
+  "<|LOC_236|>": 100533,
+  "<|LOC_237|>": 100534,
+  "<|LOC_238|>": 100535,
+  "<|LOC_239|>": 100536,
+  "<|LOC_23|>": 100320,
+  "<|LOC_240|>": 100537,
+  "<|LOC_241|>": 100538,
+  "<|LOC_242|>": 100539,
+  "<|LOC_243|>": 100540,
+  "<|LOC_244|>": 100541,
+  "<|LOC_245|>": 100542,
+  "<|LOC_246|>": 100543,
+  "<|LOC_247|>": 100544,
+  "<|LOC_248|>": 100545,
+  "<|LOC_249|>": 100546,
+  "<|LOC_24|>": 100321,
+  "<|LOC_250|>": 100547,
+  "<|LOC_251|>": 100548,
+  "<|LOC_252|>": 100549,
+  "<|LOC_253|>": 100550,
+  "<|LOC_254|>": 100551,
+  "<|LOC_255|>": 100552,
+  "<|LOC_256|>": 100553,
+  "<|LOC_257|>": 100554,
+  "<|LOC_258|>": 100555,
+  "<|LOC_259|>": 100556,
+  "<|LOC_25|>": 100322,
+  "<|LOC_260|>": 100557,
+  "<|LOC_261|>": 100558,
+  "<|LOC_262|>": 100559,
+  "<|LOC_263|>": 100560,
+  "<|LOC_264|>": 100561,
+  "<|LOC_265|>": 100562,
+  "<|LOC_266|>": 100563,
+  "<|LOC_267|>": 100564,
+  "<|LOC_268|>": 100565,
+  "<|LOC_269|>": 100566,
+  "<|LOC_26|>": 100323,
+  "<|LOC_270|>": 100567,
+  "<|LOC_271|>": 100568,
+  "<|LOC_272|>": 100569,
+  "<|LOC_273|>": 100570,
+  "<|LOC_274|>": 100571,
+  "<|LOC_275|>": 100572,
+  "<|LOC_276|>": 100573,
+  "<|LOC_277|>": 100574,
+  "<|LOC_278|>": 100575,
+  "<|LOC_279|>": 100576,
+  "<|LOC_27|>": 100324,
+  "<|LOC_280|>": 100577,
+  "<|LOC_281|>": 100578,
+  "<|LOC_282|>": 100579,
+  "<|LOC_283|>": 100580,
+  "<|LOC_284|>": 100581,
+  "<|LOC_285|>": 100582,
+  "<|LOC_286|>": 100583,
+  "<|LOC_287|>": 100584,
+  "<|LOC_288|>": 100585,
+  "<|LOC_289|>": 100586,
+  "<|LOC_28|>": 100325,
+  "<|LOC_290|>": 100587,
+  "<|LOC_291|>": 100588,
+  "<|LOC_292|>": 100589,
+  "<|LOC_293|>": 100590,
+  "<|LOC_294|>": 100591,
+  "<|LOC_295|>": 100592,
+  "<|LOC_296|>": 100593,
+  "<|LOC_297|>": 100594,
+  "<|LOC_298|>": 100595,
+  "<|LOC_299|>": 100596,
+  "<|LOC_29|>": 100326,
+  "<|LOC_2|>": 100299,
+  "<|LOC_300|>": 100597,
+  "<|LOC_301|>": 100598,
+  "<|LOC_302|>": 100599,
+  "<|LOC_303|>": 100600,
+  "<|LOC_304|>": 100601,
+  "<|LOC_305|>": 100602,
+  "<|LOC_306|>": 100603,
+  "<|LOC_307|>": 100604,
+  "<|LOC_308|>": 100605,
+  "<|LOC_309|>": 100606,
+  "<|LOC_30|>": 100327,
+  "<|LOC_310|>": 100607,
+  "<|LOC_311|>": 100608,
+  "<|LOC_312|>": 100609,
+  "<|LOC_313|>": 100610,
+  "<|LOC_314|>": 100611,
+  "<|LOC_315|>": 100612,
+  "<|LOC_316|>": 100613,
+  "<|LOC_317|>": 100614,
+  "<|LOC_318|>": 100615,
+  "<|LOC_319|>": 100616,
+  "<|LOC_31|>": 100328,
+  "<|LOC_320|>": 100617,
+  "<|LOC_321|>": 100618,
+  "<|LOC_322|>": 100619,
+  "<|LOC_323|>": 100620,
+  "<|LOC_324|>": 100621,
+  "<|LOC_325|>": 100622,
+  "<|LOC_326|>": 100623,
+  "<|LOC_327|>": 100624,
+  "<|LOC_328|>": 100625,
+  "<|LOC_329|>": 100626,
+  "<|LOC_32|>": 100329,
+  "<|LOC_330|>": 100627,
+  "<|LOC_331|>": 100628,
+  "<|LOC_332|>": 100629,
+  "<|LOC_333|>": 100630,
+  "<|LOC_334|>": 100631,
+  "<|LOC_335|>": 100632,
+  "<|LOC_336|>": 100633,
+  "<|LOC_337|>": 100634,
+  "<|LOC_338|>": 100635,
+  "<|LOC_339|>": 100636,
+  "<|LOC_33|>": 100330,
+  "<|LOC_340|>": 100637,
+  "<|LOC_341|>": 100638,
+  "<|LOC_342|>": 100639,
+  "<|LOC_343|>": 100640,
+  "<|LOC_344|>": 100641,
+  "<|LOC_345|>": 100642,
+  "<|LOC_346|>": 100643,
+  "<|LOC_347|>": 100644,
+  "<|LOC_348|>": 100645,
+  "<|LOC_349|>": 100646,
+  "<|LOC_34|>": 100331,
+  "<|LOC_350|>": 100647,
+  "<|LOC_351|>": 100648,
+  "<|LOC_352|>": 100649,
+  "<|LOC_353|>": 100650,
+  "<|LOC_354|>": 100651,
+  "<|LOC_355|>": 100652,
+  "<|LOC_356|>": 100653,
+  "<|LOC_357|>": 100654,
+  "<|LOC_358|>": 100655,
+  "<|LOC_359|>": 100656,
+  "<|LOC_35|>": 100332,
+  "<|LOC_360|>": 100657,
+  "<|LOC_361|>": 100658,
+  "<|LOC_362|>": 100659,
+  "<|LOC_363|>": 100660,
+  "<|LOC_364|>": 100661,
+  "<|LOC_365|>": 100662,
+  "<|LOC_366|>": 100663,
+  "<|LOC_367|>": 100664,
+  "<|LOC_368|>": 100665,
+  "<|LOC_369|>": 100666,
+  "<|LOC_36|>": 100333,
+  "<|LOC_370|>": 100667,
+  "<|LOC_371|>": 100668,
+  "<|LOC_372|>": 100669,
+  "<|LOC_373|>": 100670,
+  "<|LOC_374|>": 100671,
+  "<|LOC_375|>": 100672,
+  "<|LOC_376|>": 100673,
+  "<|LOC_377|>": 100674,
+  "<|LOC_378|>": 100675,
+  "<|LOC_379|>": 100676,
+  "<|LOC_37|>": 100334,
+  "<|LOC_380|>": 100677,
+  "<|LOC_381|>": 100678,
+  "<|LOC_382|>": 100679,
+  "<|LOC_383|>": 100680,
+  "<|LOC_384|>": 100681,
+  "<|LOC_385|>": 100682,
+  "<|LOC_386|>": 100683,
+  "<|LOC_387|>": 100684,
+  "<|LOC_388|>": 100685,
+  "<|LOC_389|>": 100686,
+  "<|LOC_38|>": 100335,
+  "<|LOC_390|>": 100687,
+  "<|LOC_391|>": 100688,
+  "<|LOC_392|>": 100689,
+  "<|LOC_393|>": 100690,
+  "<|LOC_394|>": 100691,
+  "<|LOC_395|>": 100692,
+  "<|LOC_396|>": 100693,
+  "<|LOC_397|>": 100694,
+  "<|LOC_398|>": 100695,
+  "<|LOC_399|>": 100696,
+  "<|LOC_39|>": 100336,
+  "<|LOC_3|>": 100300,
+  "<|LOC_400|>": 100697,
+  "<|LOC_401|>": 100698,
+  "<|LOC_402|>": 100699,
+  "<|LOC_403|>": 100700,
+  "<|LOC_404|>": 100701,
+  "<|LOC_405|>": 100702,
+  "<|LOC_406|>": 100703,
+  "<|LOC_407|>": 100704,
+  "<|LOC_408|>": 100705,
+  "<|LOC_409|>": 100706,
+  "<|LOC_40|>": 100337,
+  "<|LOC_410|>": 100707,
+  "<|LOC_411|>": 100708,
+  "<|LOC_412|>": 100709,
+  "<|LOC_413|>": 100710,
+  "<|LOC_414|>": 100711,
+  "<|LOC_415|>": 100712,
+  "<|LOC_416|>": 100713,
+  "<|LOC_417|>": 100714,
+  "<|LOC_418|>": 100715,
+  "<|LOC_419|>": 100716,
+  "<|LOC_41|>": 100338,
+  "<|LOC_420|>": 100717,
+  "<|LOC_421|>": 100718,
+  "<|LOC_422|>": 100719,
+  "<|LOC_423|>": 100720,
+  "<|LOC_424|>": 100721,
+  "<|LOC_425|>": 100722,
+  "<|LOC_426|>": 100723,
+  "<|LOC_427|>": 100724,
+  "<|LOC_428|>": 100725,
+  "<|LOC_429|>": 100726,
+  "<|LOC_42|>": 100339,
+  "<|LOC_430|>": 100727,
+  "<|LOC_431|>": 100728,
+  "<|LOC_432|>": 100729,
+  "<|LOC_433|>": 100730,
+  "<|LOC_434|>": 100731,
+  "<|LOC_435|>": 100732,
+  "<|LOC_436|>": 100733,
+  "<|LOC_437|>": 100734,
+  "<|LOC_438|>": 100735,
+  "<|LOC_439|>": 100736,
+  "<|LOC_43|>": 100340,
+  "<|LOC_440|>": 100737,
+  "<|LOC_441|>": 100738,
+  "<|LOC_442|>": 100739,
+  "<|LOC_443|>": 100740,
+  "<|LOC_444|>": 100741,
+  "<|LOC_445|>": 100742,
+  "<|LOC_446|>": 100743,
+  "<|LOC_447|>": 100744,
+  "<|LOC_448|>": 100745,
+  "<|LOC_449|>": 100746,
+  "<|LOC_44|>": 100341,
+  "<|LOC_450|>": 100747,
+  "<|LOC_451|>": 100748,
+  "<|LOC_452|>": 100749,
+  "<|LOC_453|>": 100750,
+  "<|LOC_454|>": 100751,
+  "<|LOC_455|>": 100752,
+  "<|LOC_456|>": 100753,
+  "<|LOC_457|>": 100754,
+  "<|LOC_458|>": 100755,
+  "<|LOC_459|>": 100756,
+  "<|LOC_45|>": 100342,
+  "<|LOC_460|>": 100757,
+  "<|LOC_461|>": 100758,
+  "<|LOC_462|>": 100759,
+  "<|LOC_463|>": 100760,
+  "<|LOC_464|>": 100761,
+  "<|LOC_465|>": 100762,
+  "<|LOC_466|>": 100763,
+  "<|LOC_467|>": 100764,
+  "<|LOC_468|>": 100765,
+  "<|LOC_469|>": 100766,
+  "<|LOC_46|>": 100343,
+  "<|LOC_470|>": 100767,
+  "<|LOC_471|>": 100768,
+  "<|LOC_472|>": 100769,
+  "<|LOC_473|>": 100770,
+  "<|LOC_474|>": 100771,
+  "<|LOC_475|>": 100772,
+  "<|LOC_476|>": 100773,
+  "<|LOC_477|>": 100774,
+  "<|LOC_478|>": 100775,
+  "<|LOC_479|>": 100776,
+  "<|LOC_47|>": 100344,
+  "<|LOC_480|>": 100777,
+  "<|LOC_481|>": 100778,
+  "<|LOC_482|>": 100779,
+  "<|LOC_483|>": 100780,
+  "<|LOC_484|>": 100781,
+  "<|LOC_485|>": 100782,
+  "<|LOC_486|>": 100783,
+  "<|LOC_487|>": 100784,
+  "<|LOC_488|>": 100785,
+  "<|LOC_489|>": 100786,
+  "<|LOC_48|>": 100345,
+  "<|LOC_490|>": 100787,
+  "<|LOC_491|>": 100788,
+  "<|LOC_492|>": 100789,
+  "<|LOC_493|>": 100790,
+  "<|LOC_494|>": 100791,
+  "<|LOC_495|>": 100792,
+  "<|LOC_496|>": 100793,
+  "<|LOC_497|>": 100794,
+  "<|LOC_498|>": 100795,
+  "<|LOC_499|>": 100796,
+  "<|LOC_49|>": 100346,
+  "<|LOC_4|>": 100301,
+  "<|LOC_500|>": 100797,
+  "<|LOC_501|>": 100798,
+  "<|LOC_502|>": 100799,
+  "<|LOC_503|>": 100800,
+  "<|LOC_504|>": 100801,
+  "<|LOC_505|>": 100802,
+  "<|LOC_506|>": 100803,
+  "<|LOC_507|>": 100804,
+  "<|LOC_508|>": 100805,
+  "<|LOC_509|>": 100806,
+  "<|LOC_50|>": 100347,
+  "<|LOC_510|>": 100807,
+  "<|LOC_511|>": 100808,
+  "<|LOC_512|>": 100809,
+  "<|LOC_513|>": 100810,
+  "<|LOC_514|>": 100811,
+  "<|LOC_515|>": 100812,
+  "<|LOC_516|>": 100813,
+  "<|LOC_517|>": 100814,
+  "<|LOC_518|>": 100815,
+  "<|LOC_519|>": 100816,
+  "<|LOC_51|>": 100348,
+  "<|LOC_520|>": 100817,
+  "<|LOC_521|>": 100818,
+  "<|LOC_522|>": 100819,
+  "<|LOC_523|>": 100820,
+  "<|LOC_524|>": 100821,
+  "<|LOC_525|>": 100822,
+  "<|LOC_526|>": 100823,
+  "<|LOC_527|>": 100824,
+  "<|LOC_528|>": 100825,
+  "<|LOC_529|>": 100826,
+  "<|LOC_52|>": 100349,
+  "<|LOC_530|>": 100827,
+  "<|LOC_531|>": 100828,
+  "<|LOC_532|>": 100829,
+  "<|LOC_533|>": 100830,
+  "<|LOC_534|>": 100831,
+  "<|LOC_535|>": 100832,
+  "<|LOC_536|>": 100833,
+  "<|LOC_537|>": 100834,
+  "<|LOC_538|>": 100835,
+  "<|LOC_539|>": 100836,
+  "<|LOC_53|>": 100350,
+  "<|LOC_540|>": 100837,
+  "<|LOC_541|>": 100838,
+  "<|LOC_542|>": 100839,
+  "<|LOC_543|>": 100840,
+  "<|LOC_544|>": 100841,
+  "<|LOC_545|>": 100842,
+  "<|LOC_546|>": 100843,
+  "<|LOC_547|>": 100844,
+  "<|LOC_548|>": 100845,
+  "<|LOC_549|>": 100846,
+  "<|LOC_54|>": 100351,
+  "<|LOC_550|>": 100847,
+  "<|LOC_551|>": 100848,
+  "<|LOC_552|>": 100849,
+  "<|LOC_553|>": 100850,
+  "<|LOC_554|>": 100851,
+  "<|LOC_555|>": 100852,
+  "<|LOC_556|>": 100853,
+  "<|LOC_557|>": 100854,
+  "<|LOC_558|>": 100855,
+  "<|LOC_559|>": 100856,
+  "<|LOC_55|>": 100352,
+  "<|LOC_560|>": 100857,
+  "<|LOC_561|>": 100858,
+  "<|LOC_562|>": 100859,
+  "<|LOC_563|>": 100860,
+  "<|LOC_564|>": 100861,
+  "<|LOC_565|>": 100862,
+  "<|LOC_566|>": 100863,
+  "<|LOC_567|>": 100864,
+  "<|LOC_568|>": 100865,
+  "<|LOC_569|>": 100866,
+  "<|LOC_56|>": 100353,
+  "<|LOC_570|>": 100867,
+  "<|LOC_571|>": 100868,
+  "<|LOC_572|>": 100869,
+  "<|LOC_573|>": 100870,
+  "<|LOC_574|>": 100871,
+  "<|LOC_575|>": 100872,
+  "<|LOC_576|>": 100873,
+  "<|LOC_577|>": 100874,
+  "<|LOC_578|>": 100875,
+  "<|LOC_579|>": 100876,
+  "<|LOC_57|>": 100354,
+  "<|LOC_580|>": 100877,
+  "<|LOC_581|>": 100878,
+  "<|LOC_582|>": 100879,
+  "<|LOC_583|>": 100880,
+  "<|LOC_584|>": 100881,
+  "<|LOC_585|>": 100882,
+  "<|LOC_586|>": 100883,
+  "<|LOC_587|>": 100884,
+  "<|LOC_588|>": 100885,
+  "<|LOC_589|>": 100886,
+  "<|LOC_58|>": 100355,
+  "<|LOC_590|>": 100887,
+  "<|LOC_591|>": 100888,
+  "<|LOC_592|>": 100889,
+  "<|LOC_593|>": 100890,
+  "<|LOC_594|>": 100891,
+  "<|LOC_595|>": 100892,
+  "<|LOC_596|>": 100893,
+  "<|LOC_597|>": 100894,
+  "<|LOC_598|>": 100895,
+  "<|LOC_599|>": 100896,
+  "<|LOC_59|>": 100356,
+  "<|LOC_5|>": 100302,
+  "<|LOC_600|>": 100897,
+  "<|LOC_601|>": 100898,
+  "<|LOC_602|>": 100899,
+  "<|LOC_603|>": 100900,
+  "<|LOC_604|>": 100901,
+  "<|LOC_605|>": 100902,
+  "<|LOC_606|>": 100903,
+  "<|LOC_607|>": 100904,
+  "<|LOC_608|>": 100905,
+  "<|LOC_609|>": 100906,
+  "<|LOC_60|>": 100357,
+  "<|LOC_610|>": 100907,
+  "<|LOC_611|>": 100908,
+  "<|LOC_612|>": 100909,
+  "<|LOC_613|>": 100910,
+  "<|LOC_614|>": 100911,
+  "<|LOC_615|>": 100912,
+  "<|LOC_616|>": 100913,
+  "<|LOC_617|>": 100914,
+  "<|LOC_618|>": 100915,
+  "<|LOC_619|>": 100916,
+  "<|LOC_61|>": 100358,
+  "<|LOC_620|>": 100917,
+  "<|LOC_621|>": 100918,
+  "<|LOC_622|>": 100919,
+  "<|LOC_623|>": 100920,
+  "<|LOC_624|>": 100921,
+  "<|LOC_625|>": 100922,
+  "<|LOC_626|>": 100923,
+  "<|LOC_627|>": 100924,
+  "<|LOC_628|>": 100925,
+  "<|LOC_629|>": 100926,
+  "<|LOC_62|>": 100359,
+  "<|LOC_630|>": 100927,
+  "<|LOC_631|>": 100928,
+  "<|LOC_632|>": 100929,
+  "<|LOC_633|>": 100930,
+  "<|LOC_634|>": 100931,
+  "<|LOC_635|>": 100932,
+  "<|LOC_636|>": 100933,
+  "<|LOC_637|>": 100934,
+  "<|LOC_638|>": 100935,
+  "<|LOC_639|>": 100936,
+  "<|LOC_63|>": 100360,
+  "<|LOC_640|>": 100937,
+  "<|LOC_641|>": 100938,
+  "<|LOC_642|>": 100939,
+  "<|LOC_643|>": 100940,
+  "<|LOC_644|>": 100941,
+  "<|LOC_645|>": 100942,
+  "<|LOC_646|>": 100943,
+  "<|LOC_647|>": 100944,
+  "<|LOC_648|>": 100945,
+  "<|LOC_649|>": 100946,
+  "<|LOC_64|>": 100361,
+  "<|LOC_650|>": 100947,
+  "<|LOC_651|>": 100948,
+  "<|LOC_652|>": 100949,
+  "<|LOC_653|>": 100950,
+  "<|LOC_654|>": 100951,
+  "<|LOC_655|>": 100952,
+  "<|LOC_656|>": 100953,
+  "<|LOC_657|>": 100954,
+  "<|LOC_658|>": 100955,
+  "<|LOC_659|>": 100956,
+  "<|LOC_65|>": 100362,
+  "<|LOC_660|>": 100957,
+  "<|LOC_661|>": 100958,
+  "<|LOC_662|>": 100959,
+  "<|LOC_663|>": 100960,
+  "<|LOC_664|>": 100961,
+  "<|LOC_665|>": 100962,
+  "<|LOC_666|>": 100963,
+  "<|LOC_667|>": 100964,
+  "<|LOC_668|>": 100965,
+  "<|LOC_669|>": 100966,
+  "<|LOC_66|>": 100363,
+  "<|LOC_670|>": 100967,
+  "<|LOC_671|>": 100968,
+  "<|LOC_672|>": 100969,
+  "<|LOC_673|>": 100970,
+  "<|LOC_674|>": 100971,
+  "<|LOC_675|>": 100972,
+  "<|LOC_676|>": 100973,
+  "<|LOC_677|>": 100974,
+  "<|LOC_678|>": 100975,
+  "<|LOC_679|>": 100976,
+  "<|LOC_67|>": 100364,
+  "<|LOC_680|>": 100977,
+  "<|LOC_681|>": 100978,
+  "<|LOC_682|>": 100979,
+  "<|LOC_683|>": 100980,
+  "<|LOC_684|>": 100981,
+  "<|LOC_685|>": 100982,
+  "<|LOC_686|>": 100983,
+  "<|LOC_687|>": 100984,
+  "<|LOC_688|>": 100985,
+  "<|LOC_689|>": 100986,
+  "<|LOC_68|>": 100365,
+  "<|LOC_690|>": 100987,
+  "<|LOC_691|>": 100988,
+  "<|LOC_692|>": 100989,
+  "<|LOC_693|>": 100990,
+  "<|LOC_694|>": 100991,
+  "<|LOC_695|>": 100992,
+  "<|LOC_696|>": 100993,
+  "<|LOC_697|>": 100994,
+  "<|LOC_698|>": 100995,
+  "<|LOC_699|>": 100996,
+  "<|LOC_69|>": 100366,
+  "<|LOC_6|>": 100303,
+  "<|LOC_700|>": 100997,
+  "<|LOC_701|>": 100998,
+  "<|LOC_702|>": 100999,
+  "<|LOC_703|>": 101000,
+  "<|LOC_704|>": 101001,
+  "<|LOC_705|>": 101002,
+  "<|LOC_706|>": 101003,
+  "<|LOC_707|>": 101004,
+  "<|LOC_708|>": 101005,
+  "<|LOC_709|>": 101006,
+  "<|LOC_70|>": 100367,
+  "<|LOC_710|>": 101007,
+  "<|LOC_711|>": 101008,
+  "<|LOC_712|>": 101009,
+  "<|LOC_713|>": 101010,
+  "<|LOC_714|>": 101011,
+  "<|LOC_715|>": 101012,
+  "<|LOC_716|>": 101013,
+  "<|LOC_717|>": 101014,
+  "<|LOC_718|>": 101015,
+  "<|LOC_719|>": 101016,
+  "<|LOC_71|>": 100368,
+  "<|LOC_720|>": 101017,
+  "<|LOC_721|>": 101018,
+  "<|LOC_722|>": 101019,
+  "<|LOC_723|>": 101020,
+  "<|LOC_724|>": 101021,
+  "<|LOC_725|>": 101022,
+  "<|LOC_726|>": 101023,
+  "<|LOC_727|>": 101024,
+  "<|LOC_728|>": 101025,
+  "<|LOC_729|>": 101026,
+  "<|LOC_72|>": 100369,
+  "<|LOC_730|>": 101027,
+  "<|LOC_731|>": 101028,
+  "<|LOC_732|>": 101029,
+  "<|LOC_733|>": 101030,
+  "<|LOC_734|>": 101031,
+  "<|LOC_735|>": 101032,
+  "<|LOC_736|>": 101033,
+  "<|LOC_737|>": 101034,
+  "<|LOC_738|>": 101035,
+  "<|LOC_739|>": 101036,
+  "<|LOC_73|>": 100370,
+  "<|LOC_740|>": 101037,
+  "<|LOC_741|>": 101038,
+  "<|LOC_742|>": 101039,
+  "<|LOC_743|>": 101040,
+  "<|LOC_744|>": 101041,
+  "<|LOC_745|>": 101042,
+  "<|LOC_746|>": 101043,
+  "<|LOC_747|>": 101044,
+  "<|LOC_748|>": 101045,
+  "<|LOC_749|>": 101046,
+  "<|LOC_74|>": 100371,
+  "<|LOC_750|>": 101047,
+  "<|LOC_751|>": 101048,
+  "<|LOC_752|>": 101049,
+  "<|LOC_753|>": 101050,
+  "<|LOC_754|>": 101051,
+  "<|LOC_755|>": 101052,
+  "<|LOC_756|>": 101053,
+  "<|LOC_757|>": 101054,
+  "<|LOC_758|>": 101055,
+  "<|LOC_759|>": 101056,
+  "<|LOC_75|>": 100372,
+  "<|LOC_760|>": 101057,
+  "<|LOC_761|>": 101058,
+  "<|LOC_762|>": 101059,
+  "<|LOC_763|>": 101060,
+  "<|LOC_764|>": 101061,
+  "<|LOC_765|>": 101062,
+  "<|LOC_766|>": 101063,
+  "<|LOC_767|>": 101064,
+  "<|LOC_768|>": 101065,
+  "<|LOC_769|>": 101066,
+  "<|LOC_76|>": 100373,
+  "<|LOC_770|>": 101067,
+  "<|LOC_771|>": 101068,
+  "<|LOC_772|>": 101069,
+  "<|LOC_773|>": 101070,
+  "<|LOC_774|>": 101071,
+  "<|LOC_775|>": 101072,
+  "<|LOC_776|>": 101073,
+  "<|LOC_777|>": 101074,
+  "<|LOC_778|>": 101075,
+  "<|LOC_779|>": 101076,
+  "<|LOC_77|>": 100374,
+  "<|LOC_780|>": 101077,
+  "<|LOC_781|>": 101078,
+  "<|LOC_782|>": 101079,
+  "<|LOC_783|>": 101080,
+  "<|LOC_784|>": 101081,
+  "<|LOC_785|>": 101082,
+  "<|LOC_786|>": 101083,
+  "<|LOC_787|>": 101084,
+  "<|LOC_788|>": 101085,
+  "<|LOC_789|>": 101086,
+  "<|LOC_78|>": 100375,
+  "<|LOC_790|>": 101087,
+  "<|LOC_791|>": 101088,
+  "<|LOC_792|>": 101089,
+  "<|LOC_793|>": 101090,
+  "<|LOC_794|>": 101091,
+  "<|LOC_795|>": 101092,
+  "<|LOC_796|>": 101093,
+  "<|LOC_797|>": 101094,
+  "<|LOC_798|>": 101095,
+  "<|LOC_799|>": 101096,
+  "<|LOC_79|>": 100376,
+  "<|LOC_7|>": 100304,
+  "<|LOC_800|>": 101097,
+  "<|LOC_801|>": 101098,
+  "<|LOC_802|>": 101099,
+  "<|LOC_803|>": 101100,
+  "<|LOC_804|>": 101101,
+  "<|LOC_805|>": 101102,
+  "<|LOC_806|>": 101103,
+  "<|LOC_807|>": 101104,
+  "<|LOC_808|>": 101105,
+  "<|LOC_809|>": 101106,
+  "<|LOC_80|>": 100377,
+  "<|LOC_810|>": 101107,
+  "<|LOC_811|>": 101108,
+  "<|LOC_812|>": 101109,
+  "<|LOC_813|>": 101110,
+  "<|LOC_814|>": 101111,
+  "<|LOC_815|>": 101112,
+  "<|LOC_816|>": 101113,
+  "<|LOC_817|>": 101114,
+  "<|LOC_818|>": 101115,
+  "<|LOC_819|>": 101116,
+  "<|LOC_81|>": 100378,
+  "<|LOC_820|>": 101117,
+  "<|LOC_821|>": 101118,
+  "<|LOC_822|>": 101119,
+  "<|LOC_823|>": 101120,
+  "<|LOC_824|>": 101121,
+  "<|LOC_825|>": 101122,
+  "<|LOC_826|>": 101123,
+  "<|LOC_827|>": 101124,
+  "<|LOC_828|>": 101125,
+  "<|LOC_829|>": 101126,
+  "<|LOC_82|>": 100379,
+  "<|LOC_830|>": 101127,
+  "<|LOC_831|>": 101128,
+  "<|LOC_832|>": 101129,
+  "<|LOC_833|>": 101130,
+  "<|LOC_834|>": 101131,
+  "<|LOC_835|>": 101132,
+  "<|LOC_836|>": 101133,
+  "<|LOC_837|>": 101134,
+  "<|LOC_838|>": 101135,
+  "<|LOC_839|>": 101136,
+  "<|LOC_83|>": 100380,
+  "<|LOC_840|>": 101137,
+  "<|LOC_841|>": 101138,
+  "<|LOC_842|>": 101139,
+  "<|LOC_843|>": 101140,
+  "<|LOC_844|>": 101141,
+  "<|LOC_845|>": 101142,
+  "<|LOC_846|>": 101143,
+  "<|LOC_847|>": 101144,
+  "<|LOC_848|>": 101145,
+  "<|LOC_849|>": 101146,
+  "<|LOC_84|>": 100381,
+  "<|LOC_850|>": 101147,
+  "<|LOC_851|>": 101148,
+  "<|LOC_852|>": 101149,
+  "<|LOC_853|>": 101150,
+  "<|LOC_854|>": 101151,
+  "<|LOC_855|>": 101152,
+  "<|LOC_856|>": 101153,
+  "<|LOC_857|>": 101154,
+  "<|LOC_858|>": 101155,
+  "<|LOC_859|>": 101156,
+  "<|LOC_85|>": 100382,
+  "<|LOC_860|>": 101157,
+  "<|LOC_861|>": 101158,
+  "<|LOC_862|>": 101159,
+  "<|LOC_863|>": 101160,
+  "<|LOC_864|>": 101161,
+  "<|LOC_865|>": 101162,
+  "<|LOC_866|>": 101163,
+  "<|LOC_867|>": 101164,
+  "<|LOC_868|>": 101165,
+  "<|LOC_869|>": 101166,
+  "<|LOC_86|>": 100383,
+  "<|LOC_870|>": 101167,
+  "<|LOC_871|>": 101168,
+  "<|LOC_872|>": 101169,
+  "<|LOC_873|>": 101170,
+  "<|LOC_874|>": 101171,
+  "<|LOC_875|>": 101172,
+  "<|LOC_876|>": 101173,
+  "<|LOC_877|>": 101174,
+  "<|LOC_878|>": 101175,
+  "<|LOC_879|>": 101176,
+  "<|LOC_87|>": 100384,
+  "<|LOC_880|>": 101177,
+  "<|LOC_881|>": 101178,
+  "<|LOC_882|>": 101179,
+  "<|LOC_883|>": 101180,
+  "<|LOC_884|>": 101181,
+  "<|LOC_885|>": 101182,
+  "<|LOC_886|>": 101183,
+  "<|LOC_887|>": 101184,
+  "<|LOC_888|>": 101185,
+  "<|LOC_889|>": 101186,
+  "<|LOC_88|>": 100385,
+  "<|LOC_890|>": 101187,
+  "<|LOC_891|>": 101188,
+  "<|LOC_892|>": 101189,
+  "<|LOC_893|>": 101190,
+  "<|LOC_894|>": 101191,
+  "<|LOC_895|>": 101192,
+  "<|LOC_896|>": 101193,
+  "<|LOC_897|>": 101194,
+  "<|LOC_898|>": 101195,
+  "<|LOC_899|>": 101196,
+  "<|LOC_89|>": 100386,
+  "<|LOC_8|>": 100305,
+  "<|LOC_900|>": 101197,
+  "<|LOC_901|>": 101198,
+  "<|LOC_902|>": 101199,
+  "<|LOC_903|>": 101200,
+  "<|LOC_904|>": 101201,
+  "<|LOC_905|>": 101202,
+  "<|LOC_906|>": 101203,
+  "<|LOC_907|>": 101204,
+  "<|LOC_908|>": 101205,
+  "<|LOC_909|>": 101206,
+  "<|LOC_90|>": 100387,
+  "<|LOC_910|>": 101207,
+  "<|LOC_911|>": 101208,
+  "<|LOC_912|>": 101209,
+  "<|LOC_913|>": 101210,
+  "<|LOC_914|>": 101211,
+  "<|LOC_915|>": 101212,
+  "<|LOC_916|>": 101213,
+  "<|LOC_917|>": 101214,
+  "<|LOC_918|>": 101215,
+  "<|LOC_919|>": 101216,
+  "<|LOC_91|>": 100388,
+  "<|LOC_920|>": 101217,
+  "<|LOC_921|>": 101218,
+  "<|LOC_922|>": 101219,
+  "<|LOC_923|>": 101220,
+  "<|LOC_924|>": 101221,
+  "<|LOC_925|>": 101222,
+  "<|LOC_926|>": 101223,
+  "<|LOC_927|>": 101224,
+  "<|LOC_928|>": 101225,
+  "<|LOC_929|>": 101226,
+  "<|LOC_92|>": 100389,
+  "<|LOC_930|>": 101227,
+  "<|LOC_931|>": 101228,
+  "<|LOC_932|>": 101229,
+  "<|LOC_933|>": 101230,
+  "<|LOC_934|>": 101231,
+  "<|LOC_935|>": 101232,
+  "<|LOC_936|>": 101233,
+  "<|LOC_937|>": 101234,
+  "<|LOC_938|>": 101235,
+  "<|LOC_939|>": 101236,
+  "<|LOC_93|>": 100390,
+  "<|LOC_940|>": 101237,
+  "<|LOC_941|>": 101238,
+  "<|LOC_942|>": 101239,
+  "<|LOC_943|>": 101240,
+  "<|LOC_944|>": 101241,
+  "<|LOC_945|>": 101242,
+  "<|LOC_946|>": 101243,
+  "<|LOC_947|>": 101244,
+  "<|LOC_948|>": 101245,
+  "<|LOC_949|>": 101246,
+  "<|LOC_94|>": 100391,
+  "<|LOC_950|>": 101247,
+  "<|LOC_951|>": 101248,
+  "<|LOC_952|>": 101249,
+  "<|LOC_953|>": 101250,
+  "<|LOC_954|>": 101251,
+  "<|LOC_955|>": 101252,
+  "<|LOC_956|>": 101253,
+  "<|LOC_957|>": 101254,
+  "<|LOC_958|>": 101255,
+  "<|LOC_959|>": 101256,
+  "<|LOC_95|>": 100392,
+  "<|LOC_960|>": 101257,
+  "<|LOC_961|>": 101258,
+  "<|LOC_962|>": 101259,
+  "<|LOC_963|>": 101260,
+  "<|LOC_964|>": 101261,
+  "<|LOC_965|>": 101262,
+  "<|LOC_966|>": 101263,
+  "<|LOC_967|>": 101264,
+  "<|LOC_968|>": 101265,
+  "<|LOC_969|>": 101266,
+  "<|LOC_96|>": 100393,
+  "<|LOC_970|>": 101267,
+  "<|LOC_971|>": 101268,
+  "<|LOC_972|>": 101269,
+  "<|LOC_973|>": 101270,
+  "<|LOC_974|>": 101271,
+  "<|LOC_975|>": 101272,
+  "<|LOC_976|>": 101273,
+  "<|LOC_977|>": 101274,
+  "<|LOC_978|>": 101275,
+  "<|LOC_979|>": 101276,
+  "<|LOC_97|>": 100394,
+  "<|LOC_980|>": 101277,
+  "<|LOC_981|>": 101278,
+  "<|LOC_982|>": 101279,
+  "<|LOC_983|>": 101280,
+  "<|LOC_984|>": 101281,
+  "<|LOC_985|>": 101282,
+  "<|LOC_986|>": 101283,
+  "<|LOC_987|>": 101284,
+  "<|LOC_988|>": 101285,
+  "<|LOC_989|>": 101286,
+  "<|LOC_98|>": 100395,
+  "<|LOC_990|>": 101287,
+  "<|LOC_991|>": 101288,
+  "<|LOC_992|>": 101289,
+  "<|LOC_993|>": 101290,
+  "<|LOC_994|>": 101291,
+  "<|LOC_995|>": 101292,
+  "<|LOC_996|>": 101293,
+  "<|LOC_997|>": 101294,
+  "<|LOC_998|>": 101295,
+  "<|LOC_999|>": 101296,
+  "<|LOC_99|>": 100396,
+  "<|LOC_9|>": 100306,
+  "<|LOC_BEGIN|>": 101298,
+  "<|LOC_END|>": 101299,
+  "<|LOC_SEP|>": 101300,
+  "<|image_pad|>": 101304,
+  "<|video_pad|>": 101307
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,46 @@

+{#- Chat template: emits cls_token, then one "User: ..." / "Assistant: ..." segment per message. -#}
+{#- The four blocks below only supply fallbacks for variables the caller left undefined. -#}
+{%- if not add_generation_prompt is defined -%}
+    {%- set add_generation_prompt = true -%}
+{%- endif -%}
+{%- if not cls_token is defined -%}
+    {%- set cls_token = "<|begin_of_sentence|>" -%}
+{%- endif -%}
+{%- if not eos_token is defined -%}
+    {%- set eos_token = "</s>" -%}
+{%- endif -%}
+{%- if not image_token is defined -%}
+    {%- set image_token = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" -%}
+{%- endif -%}
+{{- cls_token -}}
+{%- for message in messages -%}
+    {#- message["content"] is iterated as a list of parts, each with a "type" key. -#}
+    {%- if message["role"] == "user" -%}
+        {{- "User: " -}}
+        {#- First pass: one image_token per image part; second pass: every text part, then a newline. -#}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "image" -%}
+                {{ image_token }}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "text" -%}
+                {{ content["text"] }}
+            {%- endif -%}
+        {%- endfor -%}
+        {{ "\n" -}}
+    {%- elif message["role"] == "assistant" -%}
+        {{- "Assistant: " -}}
+        {#- Assistant turns emit their text parts and are closed with eos_token. -#}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "text" -%}
+                {{ content["text"] }}
+            {%- endif -%}
+        {%- endfor -%}
+        {{ eos_token -}}
+    {%- elif message["role"] == "system" -%}
+        {#- System turns have no role prefix; each text part is followed by a newline. -#}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "text" -%}
+                {{ content["text"] + "\n" }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+{%- endfor -%}
+{#- Open a new assistant turn for the model to complete. -#}
+{%- if add_generation_prompt -%}
+    {{- "Assistant: " -}}
+{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "architectures": [
+    "PaddleOCRVLForConditionalGeneration"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
+    "AutoModel": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration"
+  },
+  "compression_ratio": 1.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "ignored_index": -100,
+  "image_token_id": 100295,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 131072,
+  "max_sequence_length": null,
+  "model_type": "paddleocr_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 2,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 500000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.0",
+  "use_bias": false,
+  "use_cache": false,
+  "use_flash_attention": false,
+  "video_token_id": 101307,
+  "vision_config": {
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
+      "AutoModel": "modeling_paddleocr_vl.SiglipVisionModel"
+    },
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "paddleocr_vl",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "pad_token_id": 0,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "torch_dtype": "bfloat16"
+  },
+  "vision_start_token_id": 101305,
+  "vocab_size": 103424,
+  "weight_share_add_bias": true,
+  "use_3d_rope": true,
+  "rope_is_neox_style": true
+}

configuration_paddleocr_vl.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+class PaddleOCRVisionConfig(PretrainedConfig):
+    """
+    Configuration of the vision encoder, nested under the `vision_config` key.
+    Every constructor argument is stored unchanged as an attribute of the same name;
+    any extra keyword arguments are forwarded to `PretrainedConfig`.
+    """
+    model_type = "paddleocr_vl"
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=14,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        tokens_per_second=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Input geometry.
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.tokens_per_second = tokens_per_second
+        # Transformer dimensions.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # Activation, normalisation and regularisation settings.
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+class PaddleOCRVLConfig(PretrainedConfig):
+    """
+    Configuration class for the PaddleOCR-VL model.
+    It stores the language-model hyperparameters together with a nested
+    `PaddleOCRVisionConfig` (attribute `vision_config`). It inherits from
+    `PretrainedConfig`, which receives the special token IDs, the
+    `tie_word_embeddings` flag and any extra keyword arguments.
+    """
+    model_type = "paddleocr_vl"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"vision_config": PaddleOCRVisionConfig}
+    # Default tensor parallel plan for base model `Qwen3`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=768,
+        intermediate_size=11008,
+        max_position_embeddings=32768,
+        num_hidden_layers=2,
+        num_attention_heads=2,
+        image_token_id=101304,
+        video_token_id=101305,
+        vision_start_token_id=101306,
+        rms_norm_eps=1e-6,
+        use_cache=False,
+        use_flash_attention=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        head_dim=128,
+        hidden_act="silu",
+        use_bias=False,
+        rope_theta=10000,
+        weight_share_add_bias=True,
+        ignored_index=-100,
+        attention_probs_dropout_prob=0.0,
+        hidden_dropout_prob=0.0,
+        compression_ratio: float = 1.0,
+        num_key_value_heads=None,
+        max_sequence_length=None,
+        tie_word_embeddings=False,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        """
+        Initialize configuration with default or specified parameters.
+        Args:
+            vocab_size (int): Size of the vocabulary (number of unique tokens)
+            hidden_size (int): Dimensionality of the hidden states
+            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
+            max_position_embeddings (int): Maximum sequence length the model can handle
+            num_hidden_layers (int): Number of hidden layers
+            num_attention_heads (int): Number of attention heads for each attention layer
+            image_token_id (int): Stored as `self.image_token_id` (presumably the image placeholder token)
+            video_token_id (int): Stored as `self.video_token_id` (presumably the video placeholder token)
+            vision_start_token_id (int): Stored as `self.vision_start_token_id`
+            rms_norm_eps (float): The epsilon used by the RMS normalization layers
+            use_cache (bool): Whether to use caching for faster generation (decoding)
+            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
+            pad_token_id (int): Token ID used for padding sequences
+            bos_token_id (int): Token ID used for beginning-of-sequence
+            eos_token_id (int): Token ID used for end-of-sequence
+            head_dim (int): Stored as `self.head_dim`
+            hidden_act (str): Stored as `self.hidden_act`
+            use_bias (bool): Whether to use bias terms in linear layers
+            rope_theta (float): The base period of the RoPE embeddings
+            weight_share_add_bias (bool): Whether to share bias weights in certain layers
+            ignored_index (int): Target value that is ignored during loss computation
+            attention_probs_dropout_prob (float): Dropout probability for attention weights
+            hidden_dropout_prob (float): Dropout probability for hidden layers
+            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
+            num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
+            max_sequence_length (int): Maximum sequence length for positional embeddings
+            tie_word_embeddings (bool): Forwarded to `PretrainedConfig`
+            vision_config (dict, PaddleOCRVisionConfig or None): A dict is expanded into a
+                `PaddleOCRVisionConfig`, `None` builds the default one, and any other object
+                is stored as given
+            rope_scaling (dict or None): RoPE settings. A `"type"` of `"mrope"` is rewritten to
+                `"default"` and mirrored into `"rope_type"`; the dict is modified in place
+            **kwargs: Additional keyword arguments passed to parent class
+        """
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        else:
+            # An already-built config object used to be dropped silently, leaving the
+            # attribute unset; keep it.
+            self.vision_config = vision_config
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.use_flash_attention = use_flash_attention
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.sliding_window = None
+        self.use_bias = use_bias
+        self.weight_share_add_bias = weight_share_add_bias
+        self.rope_theta = rope_theta
+        self.ignored_index = ignored_index
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.compression_ratio = compression_ratio
+        self.num_key_value_heads = num_key_value_heads
+        self.max_sequence_length = max_sequence_length
+        self.rope_scaling = rope_scaling
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            if self.rope_scaling["type"] == "mrope":
+                self.rope_scaling["type"] = "default"
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+        # Call the parent initializer exactly once, last, with every special token ID.
+        # The previous second call omitted pad/bos/eos, so PretrainedConfig reset them to
+        # None. Running it last also keeps extra kwargs applied after the attributes above.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.55.0",
+  "use_cache": false
+}

image_processing.py ADDED Viewed

	@@ -0,0 +1,569 @@

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for PaddleOCR-VL."""
+import math
+from typing import Dict, List, Optional, Union
+import numpy as np
+import torch
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from torchvision.transforms import functional as TF
+from transformers.image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+logger = logging.get_logger(__name__)
+if is_vision_available():
+    from PIL import Image
+# Accepted image inputs: a single PIL image / numpy array / torch tensor, or a list of them.
+# PIL and torch types are written as string forward references.
+ImageInput = Union[
+    "PIL.Image.Image",
+    np.ndarray,
+    "torch.Tensor",
+    List["PIL.Image.Image"],
+    List[np.ndarray],
+    List["torch.Tensor"],
+]  # noqa
+# Accepted video inputs: a list of frames, a single array/tensor, or lists (and nested lists) of them.
+VideoInput = Union[
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "torch.Tensor",
+    List["np.ndarray"],
+    List["torch.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarray"]],
+    List[List["torch.Tensor"]],
+]  # noqa
+def make_batched_images(images) -> List[List[ImageInput]]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            A single image, a list/tuple of images, or a list/tuple of lists/tuples of images.
+    Returns:
+        list: For nested input, a new flat list holding every inner image; for a flat
+        list/tuple, the input object itself; for a single image, a one-element list.
+    Raises:
+        ValueError: If `images` is empty or its first element is not a valid image.
+    """
+    if isinstance(images, (list, tuple)):
+        # Only peek at `images[0]` when there is something to peek at. Empty input falls
+        # through to the ValueError below instead of leaking an IndexError.
+        if images:
+            first = images[0]
+            if isinstance(first, (list, tuple)):
+                if first and is_valid_image(first[0]):
+                    return [img for img_list in images for img in img_list]
+            elif is_valid_image(first):
+                return images
+    elif is_valid_image(images):
+        return [images]
+    raise ValueError(f"Could not make batched images from {images}")
+def adjust_size(size, patch_size):
+    """Round `size` down to a multiple of `patch_size` that spans an even number of patches."""
+    patch_count = size // patch_size
+    # Drop one patch when the count is odd so the result always covers an even count.
+    patch_count -= patch_count % 2
+    return patch_count * patch_size
+def make_batched_videos(videos) -> List[VideoInput]:
+    """
+    Normalizes video input into a list of videos, each being a sequence of frames.
+    Args:
+        videos: A list/tuple of frame lists, a list/tuple of PIL frames (one video),
+            a list/tuple of objects whose `.shape` has 4 entries, or a single such object.
+    Returns:
+        list: The input itself when it is already a list of frame lists; `[videos]` for a
+        list of PIL frames; otherwise each 4-dimensional item converted with `list(...)`.
+    Raises:
+        ValueError: If `videos` is empty or matches none of the supported layouts.
+    """
+    if isinstance(videos, (list, tuple)):
+        # Only peek at `videos[0]` when there is something to peek at. Empty input falls
+        # through to the ValueError below instead of leaking an IndexError.
+        if videos:
+            first = videos[0]
+            if isinstance(first, (list, tuple)):
+                if first and is_valid_image(first[0]):
+                    return videos
+            elif is_valid_image(first):
+                if isinstance(first, Image.Image):
+                    # A flat list of PIL frames is treated as one video.
+                    return [videos]
+                if len(first.shape) == 4:
+                    # Each item is a whole video; split it along its first axis.
+                    return [list(video) for video in videos]
+    elif is_valid_image(videos) and len(videos.shape) == 4:
+        return [list(videos)]
+    raise ValueError(f"Could not make batched video from {videos}")
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 28 * 28 * 130,
+    max_pixels: int = 28 * 28 * 1280,
+):
+    """Compute a target (height, width) for resizing an image.
+    The result satisfies:
+    1. Both dimensions are multiples of `factor`.
+    2. When the factor-rounded size falls outside [`min_pixels`, `max_pixels`], both sides
+       are rescaled by a common ratio (floored when shrinking, ceiled when growing).
+    3. The aspect ratio is kept as closely as the rounding allows.
+    A side shorter than `factor` is first raised to `factor`, with the other side scaled by
+    the same ratio; a message is printed when that happens.
+    Raises:
+        ValueError: If the long side is more than 200 times the short side.
+    """
+    if height < factor:
+        print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
+        # The right-hand side is evaluated with the old height before either name is rebound.
+        height, width = factor, round((width * factor) / height)
+    if width < factor:
+        print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
+        height, width = round((height * factor) / width), factor
+    aspect = max(height, width) / min(height, width)
+    if aspect > 200:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than 200, got {aspect}"
+        )
+    area = height * width
+    snapped_h = round(height / factor) * factor
+    snapped_w = round(width / factor) * factor
+    snapped_area = snapped_h * snapped_w
+    if snapped_area > max_pixels:
+        # Too large: shrink both sides by the same ratio and round down.
+        shrink = math.sqrt(area / max_pixels)
+        return (
+            math.floor(height / shrink / factor) * factor,
+            math.floor(width / shrink / factor) * factor,
+        )
+    if snapped_area < min_pixels:
+        # Too small: enlarge both sides by the same ratio and round up.
+        grow = math.sqrt(min_pixels / area)
+        return (
+            math.ceil(height * grow / factor) * factor,
+            math.ceil(width * grow / factor) * factor,
+        )
+    return snapped_h, snapped_w
+class SiglipImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Siglip image processor that dynamically resizes images based on the original images.
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
+            The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1670`):
+            The max pixels of the image to resize the image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spacial patch size of the vision encoder.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            The temporal patch size of the vision encoder.
+        merge_size (`int`, *optional*, defaults to 2):
+            The merge size of the vision encoder to llm encoder.
+    """
+    model_input_names = [
+        "pixel_values",
+        "image_grid_thw",
+        "pixel_values_videos",
+        "video_grid_thw",
+    ]
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 28 * 28 * 130,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        """Store the preprocessing options on the instance.
+        `image_mean` / `image_std` fall back to `OPENAI_CLIP_MEAN` / `OPENAI_CLIP_STD`
+        when left as `None`. Extra keyword arguments go to the parent constructor.
+        """
+        super().__init__(**kwargs)
+        # Colour conversion and resizing.
+        self.do_convert_rgb = do_convert_rgb
+        self.do_resize = do_resize
+        self.resample = resample
+        # Pixel budget, also mirrored into `size`.
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
+        # Value rescaling and normalisation.
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = OPENAI_CLIP_MEAN if image_mean is None else image_mean
+        self.image_std = OPENAI_CLIP_STD if image_std is None else image_std
+        # Patch layout.
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.merge_size = merge_size
+    def mvit_rescale(self, image: Image.Image, merge_size: int = 2) -> Image.Image:
+        """Shrink, then pad or crop, a PIL image to fit the patch grid.
+        Steps:
+        1. If the image holds more than `self.in_token_limit` whole patches, scale both
+           sides down by a common ratio with bicubic resampling.
+        2. If `self.pad_input` is truthy, pad the right and bottom edges so both sides are
+           multiples of `merge_size * patch_size`; otherwise center-crop so both sides
+           cover an even number of whole patches.
+        3. If either side still spans 512 or more patches, center-crop each side to at
+           most 510 patches.
+        NOTE(review): `in_token_limit` and `pad_input` are not assigned in the `__init__`
+        visible in this file; presumably they arrive through `**kwargs` - confirm.
+        Args:
+            image: Image exposing a `(width, height)` `size` attribute.
+            merge_size: Patches per merged cell along each side, used for padding only.
+        Returns:
+            The resized and padded/cropped image.
+        Raises:
+            ValueError: If `image.size` cannot be unpacked into width and height.
+        """
+        try:
+            w, h = image.size
+        except Exception as err:
+            # Still report the offending object as a ValueError, but chain the real cause
+            # and stop intercepting KeyboardInterrupt/SystemExit like the bare `except:` did.
+            raise ValueError(str((type(image), image))) from err
+        patch_size = self.patch_size
+        num_patches = (w // patch_size) * (h // patch_size)
+        if num_patches > self.in_token_limit:
+            scale = math.sqrt(self.in_token_limit / num_patches)
+            new_w, new_h = int(w * scale), int(h * scale)
+            image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
+        if self.pad_input:
+            new_w, new_h = image.size
+            pad_size_h = merge_size * patch_size
+            pad_size_w = merge_size * patch_size
+            pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
+            pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
+            # Padding tuple is (left, top, right, bottom): only right/bottom grow.
+            image = TF.pad(image, (0, 0, pad_w, pad_h))
+        else:
+            new_w, new_h = image.size
+            # adjust_size already floors to whole patches, so no separate modulo trim is needed.
+            new_w = adjust_size(new_w, patch_size)
+            new_h = adjust_size(new_h, patch_size)
+            image = TF.center_crop(image, (new_h, new_w))
+        w, h = image.size
+        if w // patch_size >= 512 or h // patch_size >= 512:
+            new_h = min(patch_size * 510, h)
+            new_w = min(patch_size * 510, w)
+            image = TF.center_crop(image, (new_h, new_w))
+            # raise ValueError("Exceed pos emb")
+        return image
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        do_resize: bool = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess one image (or the frames of one video) and cut the result into square patches.
+        Each frame is optionally converted to RGB, resized, rescaled and normalized; the stacked frames are then
+        split into `self.patch_size` x `self.patch_size` tiles. Arguments left as `None` are NOT replaced by the
+        instance defaults in this method; `preprocess` resolves them before calling it.
+        Args:
+            images (`ImageInput`):
+                Image or list of frames to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*):
+                Whether to resize the image. The target size is computed by `smart_resize` from the size of the
+                first frame, `self.patch_size * self.merge_size`, `self.min_pixels` and `self.max_pixels`.
+            resample (`PILImageResampling`, *optional*):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*):
+                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `List[float]`, *optional*):
+                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+            do_convert_rgb (`bool`, *optional*):
+                Whether to convert the image to RGB.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format each frame is converted to before stacking. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                Channels-last frames are transposed back to channels-first before patching, so the returned
+                patches are channels-first in both cases.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, it is inferred from the first frame. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        Returns:
+            A tuple `(flatten_patches, (grid_t, grid_h, grid_w))`. `flatten_patches` is an `np.ndarray` of shape
+            `(grid_t * grid_h * grid_w, num_channels, patch_size, patch_size)`; the second element holds the number
+            of patches along the temporal, height and width axes.
+        Raises:
+            ValueError: If `self.temporal_patch_size` is not 1.
+        """
+        if self.temporal_patch_size != 1:
+            # The final reshape drops the temporal-patch axis, which is only valid when that axis has length 1.
+            raise ValueError(
+                f"`temporal_patch_size` must be 1, got {self.temporal_patch_size}."
+            )
+        images = make_list_of_images(images)
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        if do_resize:
+            # Every frame is resized to the same target, derived from the first frame, so compute it once.
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=self.patch_size * self.merge_size,
+                min_pixels=self.min_pixels,
+                max_pixels=self.max_pixels,
+            )
+        processed_images = []
+        for image in images:
+            if do_resize:
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
+            if do_rescale:
+                image = self.rescale(
+                    image, scale=rescale_factor, input_data_format=input_data_format
+                )
+            if do_normalize:
+                image = self.normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    input_data_format=input_data_format,
+                )
+            image = to_channel_dimension_format(
+                image, data_format, input_channel_dim=input_data_format
+            )
+            processed_images.append(image)
+        patches = np.array(processed_images)
+        if data_format == ChannelDimension.LAST:
+            # Patching below indexes the array as (frames, channels, height, width).
+            patches = patches.transpose(0, 3, 1, 2)
+        if patches.shape[0] == 1:
+            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        channel = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        # Split height and width into (grid, patch) pairs ...
+        patches = patches.reshape(
+            grid_t,
+            self.temporal_patch_size,
+            channel,
+            grid_h,
+            self.patch_size,
+            grid_w,
+            self.patch_size,
+        )
+        # ... then bring the grid axes to the front: (grid_t, grid_h, grid_w, channel, temporal, patch_h, patch_w).
+        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+        flatten_patches = patches.reshape(
+            grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
+        )
+        return flatten_patches, (grid_t, grid_h, grid_w)
+    def preprocess(
+        self,
+        images: ImageInput,
+        videos: VideoInput = None,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            videos (`VideoInput`):
+                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
+                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = (
+            rescale_factor if rescale_factor is not None else self.rescale_factor
+        )
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = (
+            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        )
+        if images is not None:
+            images = make_batched_images(images)
+        if videos is not None:
+            videos = make_batched_videos(videos)
+        if images is not None and not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        if images is not None:
+            pixel_values, vision_grid_thws = [], []
+            for image in images:
+                patches, image_grid_thw = self._preprocess(
+                    image,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(image_grid_thw)
+            pixel_values = np.array(pixel_values)
+            vision_grid_thws = np.array(vision_grid_thws)
+            data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
+        if videos is not None:
+            pixel_values, vision_grid_thws = [], []
+            for images in videos:
+                patches, video_grid_thw = self._preprocess(
+                    images,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(video_grid_thw)
+            pixel_values = np.array(pixel_values)
+            vision_grid_thws = np.array(vision_grid_thws)
+            data = {
+                "pixel_values_videos": pixel_values,
+                "video_grid_thw": vision_grid_thws,
+            }
+        return BatchFeature(data=data, tensor_type=return_tensors)

inference.yml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Global:
2	+ model_name: PaddleOCR-VL-0.9B

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3085f1042e184f68f8a412aa0f64f2c4b8562989598bbfba326aaa11fc685de8
+size 1917255968

modeling_paddleocr_vl.py ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing.SiglipImageProcessor",
+    "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
+  },
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "max_pixels": 2822400,
+  "merge_size": 2,
+  "min_pixels": 147384,
+  "patch_size": 14,
+  "processor_class": "PaddleOCRVLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "max_pixels": 2822400,
+    "min_pixels": 147384
+  },
+  "temporal_patch_size": 1
+}

processing_paddleocr_vl.py ADDED Viewed

	@@ -0,0 +1,293 @@

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+import numpy as np
+import torch
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import (
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+    VideosKwargs,
+)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ImageInput = Union[
+    "PIL.Image.Image",
+    np.ndarray,
+    "torch.Tensor",
+    List["PIL.Image.Image"],
+    List[np.ndarray],
+    List["torch.Tensor"],
+]  # noqa
+VideoInput = Union[
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "torch.Tensor",
+    List["np.ndarray"],
+    List["torch.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarrray"]],
+    List[List["torch.Tensor"]],
+]  # noqa
+class PaddleOCRVLVideosProcessorKwargs(VideosKwargs, total=False):
+    """Video keyword arguments accepted by `PaddleOCRVLProcessor.__call__`, extending `VideosKwargs` with `fps`."""
+    # Either one number applied to every video or one value per video; the processor divides
+    # `temporal_patch_size` by it to obtain `second_per_grid_ts`.
+    fps: Union[List[float], float]
+class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema passed to `_merge_kwargs` by `PaddleOCRVLProcessor.__call__`."""
+    videos_kwargs: PaddleOCRVLVideosProcessorKwargs
+    # Per-group default values: text is not padded and videos use an `fps` of 2.0.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "videos_kwargs": {"fps": 2.0},
+    }
+class PaddleOCRVLProcessor(ProcessorMixin):
+    r"""
+    [`PaddleOCRVLProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.
+    Args:
+        image_processor ([`SiglipImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "image_std",
+        "min_pixels",
+        "image_mean",
+        "merge_size",
+        "image_processor_type",
+        "temporal_patch_size",
+        "patch_size",
+        "max_pixels",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
+    ):
+        # Use the tokenizer's own vision tokens when it defines them, otherwise fall back to the defaults.
+        self.image_token = getattr(tokenizer, "image_token", "<|IMAGE_PLACEHOLDER|>")
+        self.video_token = getattr(tokenizer, "video_token", "<|video_pad|>")
+        # Extra `kwargs` are accepted but not forwarded to the base class.
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+    def _expand_vision_tokens(self, text, token, grid_thw):
+        """
+        Return a copy of `text` in which the k-th occurrence of `token` (counted across the whole batch) is
+        repeated `grid_thw[k].prod() // merge_size // merge_size` times. The input list is left untouched.
+        """
+        merge_size = self.image_processor.merge_size
+        expanded = []
+        index = 0
+        for sample in text:
+            while token in sample:
+                # A temporary marker keeps already expanded tokens from being matched again by the loop.
+                sample = sample.replace(
+                    token,
+                    "<|placeholder|>"
+                    * (grid_thw[index].prod() // merge_size // merge_size),
+                    1,
+                )
+                index += 1
+            expanded.append(sample.replace("<|placeholder|>", token))
+        return expanded
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[
+            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+        ] = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and the merged text kwargs to the tokenizer, and forwards `images` / `videos` to the image processor when
+        they are not `None`. Each image or video token found in `text` is expanded to as many tokens as the
+        corresponding vision input occupies after patch merging. The `text` list passed by the caller is not modified.
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
+        Raises:
+            ValueError: If `fps` is neither a single number nor a sequence with one entry per video.
+        """
+        output_kwargs = self._merge_kwargs(
+            PaddleOCRVLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            # NOTE(review): `output_kwargs["images_kwargs"]` is not forwarded on this path and PyTorch
+            # tensors are always requested -- confirm this is intentional.
+            image_inputs = self.image_processor(images=images, return_tensors="pt")
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+        if videos is not None:
+            # TODO: add video processing
+            videos_inputs = self.image_processor(
+                images=None, videos=videos, **output_kwargs["images_kwargs"]
+            )
+            video_grid_thw = videos_inputs["video_grid_thw"]
+            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
+            if isinstance(fps, (int, float)):
+                # One frame rate shared by every video.
+                second_per_grid_ts = [
+                    self.image_processor.temporal_patch_size / fps
+                ] * len(video_grid_thw)
+            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
+                # One frame rate per video.
+                second_per_grid_ts = [
+                    self.image_processor.temporal_patch_size / tmp for tmp in fps
+                ]
+            else:
+                raise ValueError(
+                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
+                )
+            videos_inputs.update(
+                {"second_per_grid_ts": torch.tensor(second_per_grid_ts)}
+            )
+        else:
+            videos_inputs = {}
+            video_grid_thw = None
+        if not isinstance(text, list):
+            text = [text]
+        # The helper builds a new list, so a list supplied by the caller is never modified in place.
+        if image_grid_thw is not None:
+            text = self._expand_vision_tokens(text, self.image_token, image_grid_thw)
+        if video_grid_thw is not None:
+            text = self._expand_vision_tokens(text, self.video_token, video_grid_thw)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    def post_process_image_text_to_text(
+        self,
+        generated_outputs,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode` method.
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+    @property
+    def model_input_names(self):
+        """Tokenizer and image-processor input names, de-duplicated in order, plus `second_per_grid_ts`."""
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        names_from_processor = list(
+            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+        )
+        return names_from_processor + ["second_per_grid_ts"]
+# Public API of this module.
+__all__ = ["PaddleOCRVLProcessor"]

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
+  },
+  "processor_class": "PaddleOCRVLProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "additional_special_tokens": [
+    "<|IMAGE_PLACEHOLDER|>",
+    "<|image_pad|>",
+    "<|IMAGE_START|>",
+    "<|IMAGE_END|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<|begin_of_sentence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask:1>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<|end_of_sentence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f90f04fd8e5eb6dfa380f37d10c87392de8438dccb6768a2486b5a96ee76dba6
+size 11187679

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34ef7db83df785924fb83d7b887b6e822a031c56e15cff40aaf9b982988180df
+size 1614363

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff