Spaces:

FelipeErmeson
/

projeto-rag

Running on Zero

Felipe Silva committed on Sep 13

Commit

fff96e7

1 Parent(s): 184667b

ajuste docling

Files changed (3) hide show

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import spaces
 import torch
 import os
 from huggingface_hub import snapshot_download
-from utils import read_file_pdf, fix_type, extract_content_in_pdf, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
 from rag_utils import create_split_doc, store_docs, create_rag_chain
 import config
@@ -43,7 +43,7 @@ def process_file(file):
     if type_file in EXTENSIONS_FILES:
         texto_extraido = extract_content_in_pdf(file_obj)
     elif type_file in EXTENSIONS_IMG_FILES:
-        texto_extraido = "OCR não implementado neste exemplo."
     return texto_extraido or "Não foi possível extrair texto."
 @spaces.GPU

 import torch
 import os
 from huggingface_hub import snapshot_download
+from utils import read_file_pdf, fix_type, extract_content_in_pdf, doc_converter, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
 from rag_utils import create_split_doc, store_docs, create_rag_chain
 import config
     if type_file in EXTENSIONS_FILES:
         texto_extraido = extract_content_in_pdf(file_obj)
     elif type_file in EXTENSIONS_IMG_FILES:
+        texto_extraido = doc_converter(file)
     return texto_extraido or "Não foi possível extrair texto."
 @spaces.GPU

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ autoawq==0.2.9
 #intel_extension_for_pytorch==2.5.0
 optimum==1.27.0
 auto-gptq==0.7.1
-huggingface_hub

 #intel_extension_for_pytorch==2.5.0
 optimum==1.27.0
 auto-gptq==0.7.1
+huggingface_hub
+docling==2.52.0

utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from PyPDF2 import PdfReader
 from PIL import Image
 from io import BytesIO
 EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
 EXTENSIONS_FILES = ['pdf']
@@ -12,10 +13,16 @@ MAX_IMAGE_SIZE = 2000  # pixels
 def fix_type(file_upload):
     type_file = file_upload.split('/')[-1].split('.')[-1]
     if type_file in EXTENSIONS_IMG_FILES:
-        return read_file_img(file_upload), type_file
     elif type_file in EXTENSIONS_FILES:
         return read_file_pdf(file_upload), type_file
 # Resize image while maintaining aspect ratio
 def resize_image(image, max_size):
     width, height = image.size

 from PyPDF2 import PdfReader
 from PIL import Image
 from io import BytesIO
+from docling.document_converter import DocumentConverter
 EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
 EXTENSIONS_FILES = ['pdf']
 def fix_type(file_upload):
     type_file = file_upload.split('/')[-1].split('.')[-1]
     if type_file in EXTENSIONS_IMG_FILES:
+        return None, type_file
+        # return read_file_img(file_upload), type_file
     elif type_file in EXTENSIONS_FILES:
         return read_file_pdf(file_upload), type_file
+def doc_converter(file_path):
+    converter = DocumentConverter()
+    result = converter.convert(file_path)
+    return result.document.export_to_markdown()
 # Resize image while maintaining aspect ratio
 def resize_image(image, max_size):
     width, height = image.size