Update README.md (#4)
- Update README.md (9f57da6408d7a06a3d96c3491b069365b0fc8e0f)
Co-authored-by: jujeongho <[email protected]>

README.md CHANGED
@@ -122,7 +122,7 @@ conversation = [
         "role": "user",
         "content": [
             {"type": "image", "url": "https://huggingface.co/NCSOFT/VARCO-VISION-2.0-14B/resolve/main/demo.jpg"},
-            {"type": "text", "text": "
+            {"type": "text", "text": "각 박스마다 한 줄씩 색상과 글자를 정확하게 출력해주세요."},
         ],
     },
 ]
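In English, the added prompt reads: "For each box, print the color and the text exactly, one line per box." For reference, a minimal runnable view of the example this hunk edits is sketched below; it assumes `processor` and `model` are already created as in the README's earlier setup, since the diff only changes the prompt string.

```python
# A runnable view of the single-image example this hunk edits.
# Assumption: `processor` and `model` exist, created as in the README's
# earlier setup code (the diff itself only changes the prompt string).
import torch

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/NCSOFT/VARCO-VISION-2.0-14B/resolve/main/demo.jpg"},
            # "For each box, print the color and the text exactly, one line per box."
            {"type": "text", "text": "각 박스마다 한 줄씩 색상과 글자를 정확하게 출력해주세요."},
        ],
    },
]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, torch.float16)

generate_ids = model.generate(**inputs, max_new_tokens=1024)
# Decode only the tokens generated after the prompt.
output = processor.decode(
    generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(output)
```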
@@ -175,6 +175,50 @@ print(output)
 ```
 </details>
 
+<details>
+<summary>Batch inference</summary>
+
+All inputs in a batch must have the same modality structure (for example, text-only with text-only, single-image with single-image, and multi-image inputs with the same number of images) to ensure correct batch inference.
+
+```python
+conversation_1 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "file:///path/to/image1.jpg"},
+            {"type": "text", "text": "이미지를 설명해주세요."},
+        ],
+    },
+]
+
+conversation_2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "file:///path/to/image2.jpg"},
+            {"type": "text", "text": "이 이미지에 표시된 것은 무엇인가요?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    [conversation_1, conversation_2],
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    padding=True,
+    return_tensors="pt"
+).to(model.device, torch.float16)
+
+generate_ids = model.generate(**inputs, max_new_tokens=1024)
+generate_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generate_ids)
+]
+output = processor.batch_decode(generate_ids_trimmed, skip_special_tokens=True)
+print(output)
+```
+</details>
+
 <details>
 <summary>OCR inference</summary>
 
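The added batch-inference code assumes a `processor` and `model` are already in scope from the README's setup. Below is a self-contained sketch that fills in those pieces; the `AutoProcessor`/`AutoModelForImageTextToText` classes and the left-side padding setting are assumptions (none of them appear in this diff), and the Korean prompts are translated in comments.

```python
# Self-contained sketch of the new "Batch inference" section.
# Assumptions not shown in this diff: the model loads through the standard
# transformers auto classes, and the tokenizer left-pads so that all prompts
# in the batch end at the generation boundary.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "NCSOFT/VARCO-VISION-2.0-14B"
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "left"  # assumed setting for batched generation
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

conversation_1 = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/image1.jpg"},
            {"type": "text", "text": "이미지를 설명해주세요."},  # "Please describe the image."
        ],
    },
]

conversation_2 = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/image2.jpg"},
            {"type": "text", "text": "이 이미지에 표시된 것은 무엇인가요?"},  # "What is shown in this image?"
        ],
    },
]

# Both conversations are single-image, so the batch satisfies the
# same-modality rule stated in the new README section.
inputs = processor.apply_chat_template(
    [conversation_1, conversation_2],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    padding=True,
    return_tensors="pt",
).to(model.device, torch.float16)

generate_ids = model.generate(**inputs, max_new_tokens=1024)
# generate() returns prompt + completion per row; drop the prompt tokens.
generate_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generate_ids)
]
output = processor.batch_decode(generate_ids_trimmed, skip_special_tokens=True)
print(output)
```

With left padding, every prompt ends flush against the generation boundary, so trimming `len(in_ids)` tokens from each output row removes exactly the padded prompt and leaves only the completion.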