Spaces:
Running
on
Zero
Running
on
Zero
Felipe Silva
committed on
Commit
·
fff96e7
1
Parent(s):
184667b
ajuste docling
Browse files
- app.py +2 -2
- requirements.txt +2 -1
- utils.py +8 -1
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import spaces
|
|
| 3 |
import torch
|
| 4 |
import os
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
-
from utils import read_file_pdf, fix_type, extract_content_in_pdf, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
|
| 7 |
from rag_utils import create_split_doc, store_docs, create_rag_chain
|
| 8 |
import config
|
| 9 |
|
|
@@ -43,7 +43,7 @@ def process_file(file):
|
|
| 43 |
if type_file in EXTENSIONS_FILES:
|
| 44 |
texto_extraido = extract_content_in_pdf(file_obj)
|
| 45 |
elif type_file in EXTENSIONS_IMG_FILES:
|
| 46 |
-
texto_extraido =
|
| 47 |
return texto_extraido or "Não foi possível extrair texto."
|
| 48 |
|
| 49 |
@spaces.GPU
|
|
|
|
| 3 |
import torch
|
| 4 |
import os
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
+
from utils import read_file_pdf, fix_type, extract_content_in_pdf, doc_converter, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
|
| 7 |
from rag_utils import create_split_doc, store_docs, create_rag_chain
|
| 8 |
import config
|
| 9 |
|
|
|
|
| 43 |
if type_file in EXTENSIONS_FILES:
|
| 44 |
texto_extraido = extract_content_in_pdf(file_obj)
|
| 45 |
elif type_file in EXTENSIONS_IMG_FILES:
|
| 46 |
+
texto_extraido = doc_converter(file)
|
| 47 |
return texto_extraido or "Não foi possível extrair texto."
|
| 48 |
|
| 49 |
@spaces.GPU
|
requirements.txt
CHANGED
|
@@ -8,4 +8,5 @@ autoawq==0.2.9
|
|
| 8 |
#intel_extension_for_pytorch==2.5.0
|
| 9 |
optimum==1.27.0
|
| 10 |
auto-gptq==0.7.1
|
| 11 |
-
huggingface_hub
|
|
|
|
|
|
| 8 |
#intel_extension_for_pytorch==2.5.0
|
| 9 |
optimum==1.27.0
|
| 10 |
auto-gptq==0.7.1
|
| 11 |
+
huggingface_hub
|
| 12 |
+
docling==2.52.0
|
utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from PyPDF2 import PdfReader
|
| 2 |
from PIL import Image
|
| 3 |
from io import BytesIO
|
|
|
|
| 4 |
|
| 5 |
EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
|
| 6 |
EXTENSIONS_FILES = ['pdf']
|
|
@@ -12,10 +13,16 @@ MAX_IMAGE_SIZE = 2000 # pixels
|
|
| 12 |
def fix_type(file_upload):
|
| 13 |
type_file = file_upload.split('/')[-1].split('.')[-1]
|
| 14 |
if type_file in EXTENSIONS_IMG_FILES:
|
| 15 |
-
return
|
|
|
|
| 16 |
elif type_file in EXTENSIONS_FILES:
|
| 17 |
return read_file_pdf(file_upload), type_file
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Resize image while maintaining aspect ratio
|
| 20 |
def resize_image(image, max_size):
|
| 21 |
width, height = image.size
|
|
|
|
| 1 |
from PyPDF2 import PdfReader
|
| 2 |
from PIL import Image
|
| 3 |
from io import BytesIO
|
| 4 |
+
from docling.document_converter import DocumentConverter
|
| 5 |
|
| 6 |
EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
|
| 7 |
EXTENSIONS_FILES = ['pdf']
|
|
|
|
| 13 |
def fix_type(file_upload):
|
| 14 |
type_file = file_upload.split('/')[-1].split('.')[-1]
|
| 15 |
if type_file in EXTENSIONS_IMG_FILES:
|
| 16 |
+
return None, type_file
|
| 17 |
+
# return read_file_img(file_upload), type_file
|
| 18 |
elif type_file in EXTENSIONS_FILES:
|
| 19 |
return read_file_pdf(file_upload), type_file
|
| 20 |
|
| 21 |
+
def doc_converter(file_path):
|
| 22 |
+
converter = DocumentConverter()
|
| 23 |
+
result = converter.convert(file_path)
|
| 24 |
+
return result.document.export_to_markdown()
|
| 25 |
+
|
| 26 |
# Resize image while maintaining aspect ratio
|
| 27 |
def resize_image(image, max_size):
|
| 28 |
width, height = image.size
|