Felipe Silva committed on
Commit
fff96e7
·
1 Parent(s): 184667b

ajuste docling

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. requirements.txt +2 -1
  3. utils.py +8 -1
app.py CHANGED
@@ -3,7 +3,7 @@ import spaces
3
  import torch
4
  import os
5
  from huggingface_hub import snapshot_download
6
- from utils import read_file_pdf, fix_type, extract_content_in_pdf, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
7
  from rag_utils import create_split_doc, store_docs, create_rag_chain
8
  import config
9
 
@@ -43,7 +43,7 @@ def process_file(file):
43
  if type_file in EXTENSIONS_FILES:
44
  texto_extraido = extract_content_in_pdf(file_obj)
45
  elif type_file in EXTENSIONS_IMG_FILES:
46
- texto_extraido = "OCR não implementado neste exemplo."
47
  return texto_extraido or "Não foi possível extrair texto."
48
 
49
  @spaces.GPU
 
3
  import torch
4
  import os
5
  from huggingface_hub import snapshot_download
6
+ from utils import read_file_pdf, fix_type, extract_content_in_pdf, doc_converter, EXTENSIONS_FILES, EXTENSIONS_IMG_FILES
7
  from rag_utils import create_split_doc, store_docs, create_rag_chain
8
  import config
9
 
 
43
  if type_file in EXTENSIONS_FILES:
44
  texto_extraido = extract_content_in_pdf(file_obj)
45
  elif type_file in EXTENSIONS_IMG_FILES:
46
+ texto_extraido = doc_converter(file)
47
  return texto_extraido or "Não foi possível extrair texto."
48
 
49
  @spaces.GPU
requirements.txt CHANGED
@@ -8,4 +8,5 @@ autoawq==0.2.9
8
  #intel_extension_for_pytorch==2.5.0
9
  optimum==1.27.0
10
  auto-gptq==0.7.1
11
- huggingface_hub
 
 
8
  #intel_extension_for_pytorch==2.5.0
9
  optimum==1.27.0
10
  auto-gptq==0.7.1
11
+ huggingface_hub
12
+ docling==2.52.0
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  from PyPDF2 import PdfReader
2
  from PIL import Image
3
  from io import BytesIO
 
4
 
5
  EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
6
  EXTENSIONS_FILES = ['pdf']
@@ -12,10 +13,16 @@ MAX_IMAGE_SIZE = 2000 # pixels
12
  def fix_type(file_upload):
13
  type_file = file_upload.split('/')[-1].split('.')[-1]
14
  if type_file in EXTENSIONS_IMG_FILES:
15
- return read_file_img(file_upload), type_file
 
16
  elif type_file in EXTENSIONS_FILES:
17
  return read_file_pdf(file_upload), type_file
18
 
 
 
 
 
 
19
  # Resize image while maintaining aspect ratio
20
  def resize_image(image, max_size):
21
  width, height = image.size
 
1
  from PyPDF2 import PdfReader
2
  from PIL import Image
3
  from io import BytesIO
4
+ from docling.document_converter import DocumentConverter
5
 
6
  EXTENSIONS_IMG_FILES = ['jpeg', 'jpg', 'png']
7
  EXTENSIONS_FILES = ['pdf']
 
13
  def fix_type(file_upload):
14
  type_file = file_upload.split('/')[-1].split('.')[-1]
15
  if type_file in EXTENSIONS_IMG_FILES:
16
+ return None, type_file
17
+ # return read_file_img(file_upload), type_file
18
  elif type_file in EXTENSIONS_FILES:
19
  return read_file_pdf(file_upload), type_file
20
 
21
+ def doc_converter(file_path):
22
+ converter = DocumentConverter()
23
+ result = converter.convert(file_path)
24
+ return result.document.export_to_markdown()
25
+
26
  # Resize image while maintaining aspect ratio
27
  def resize_image(image, max_size):
28
  width, height = image.size