ibraheem007 committed on
Commit
aa4aa69
·
verified ·
1 Parent(s): 6187f1d

Update utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +150 -5
utils/file_utils.py CHANGED
@@ -6,46 +6,91 @@ import logging
6
 
7
  logger = logging.getLogger(__name__)
8
 
 
 
 
9
  def extract_text_from_pdf(pdf_path):
10
- """Extracts text from PDF files - enhanced for Hugging Face"""
11
  logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
12
  try:
 
 
 
 
 
 
 
13
  if not os.path.exists(pdf_path):
14
  raise FileNotFoundError(f"File not found: {pdf_path}")
15
 
16
  doc = fitz.open(pdf_path)
17
  full_text = ""
 
 
 
18
  for page_num, page in enumerate(doc):
19
  page_text = page.get_text()
20
  full_text += page_text
21
- logger.debug(f"πŸ“„ Page {page_num + 1}: {len(page_text)} characters")
 
 
 
22
 
23
  doc.close()
 
 
 
 
 
24
  logger.info(f"βœ… PDF extraction complete: {len(full_text)} total characters")
25
  return full_text.strip()
 
 
 
 
26
  except Exception as e:
27
  logger.error(f"❌ PDF extraction failed: {e}")
 
 
 
28
  raise Exception(f"Failed to extract text from PDF: {str(e)}")
29
 
30
  def extract_text_from_pdf_bytes(pdf_bytes):
31
  """Extract text from PDF bytes without temp files"""
32
  try:
 
 
 
 
 
33
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
34
  full_text = ""
35
  for page in doc:
36
  full_text += page.get_text()
37
  doc.close()
 
 
 
 
38
  return full_text.strip()
 
 
 
39
  except Exception as e:
40
  logger.error(f"❌ PDF bytes extraction failed: {e}")
41
  raise
42
 
43
- # Keep your existing PPTX and DOCX functions as they are...
44
-
45
  def extract_text_from_pptx(pptx_path):
46
  """Extracts text from PowerPoint (PPTX) files."""
47
  logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")
48
  try:
 
 
 
 
 
 
 
49
  if not os.path.exists(pptx_path):
50
  raise FileNotFoundError(f"File not found: {pptx_path}")
51
 
@@ -57,8 +102,14 @@ def extract_text_from_pptx(pptx_path):
57
  full_text += shape.text + "\n"
58
  logger.debug(f"πŸ“Š Slide {slide_num + 1} processed")
59
 
 
 
 
60
  logger.info(f"βœ… PPTX extraction complete: {len(full_text)} total characters")
61
  return full_text.strip()
 
 
 
62
  except Exception as e:
63
  logger.error(f"❌ PPTX extraction failed: {e}")
64
  raise Exception(f"Failed to extract text from PowerPoint: {str(e)}")
@@ -67,13 +118,107 @@ def extract_text_from_docx(docx_path):
67
  """Extracts text from Word (DOCX) files."""
68
  logger.info(f"πŸ“ Extracting text from DOCX: {docx_path}")
69
  try:
 
 
 
 
 
 
 
70
  if not os.path.exists(docx_path):
71
  raise FileNotFoundError(f"File not found: {docx_path}")
72
 
73
  doc = Document(docx_path)
74
  full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 
 
 
 
75
  logger.info(f"βœ… DOCX extraction complete: {len(full_text)} total characters")
76
  return full_text.strip()
 
 
 
77
  except Exception as e:
78
  logger.error(f"❌ DOCX extraction failed: {e}")
79
- raise Exception(f"Failed to extract text from Word document: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  logger = logging.getLogger(__name__)
8
 
9
+ # File size limits (in bytes)
10
+ MAX_PDF_SIZE = 50 * 1024 * 1024 # 50MB
11
+
12
  def extract_text_from_pdf(pdf_path):
13
+ """Extracts text from PDF files - enhanced for large files"""
14
  logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
15
  try:
16
+ # Check file size first
17
+ if os.path.exists(pdf_path):
18
+ file_size = os.path.getsize(pdf_path)
19
+ if file_size > MAX_PDF_SIZE:
20
+ file_size_mb = file_size / (1024 * 1024)
21
+ raise Exception(f"PDF file too large ({file_size_mb:.1f}MB). Maximum supported size is {MAX_PDF_SIZE // (1024*1024)}MB.")
22
+
23
  if not os.path.exists(pdf_path):
24
  raise FileNotFoundError(f"File not found: {pdf_path}")
25
 
26
  doc = fitz.open(pdf_path)
27
  full_text = ""
28
+ page_count = len(doc)
29
+
30
+ # For very large files, process with progress tracking
31
  for page_num, page in enumerate(doc):
32
  page_text = page.get_text()
33
  full_text += page_text
34
+
35
+ # Show progress for very large files
36
+ if page_count > 50 and page_num % 10 == 0:
37
+ logger.info(f"πŸ“„ Processed {page_num}/{page_count} pages...")
38
 
39
  doc.close()
40
+
41
+ if not full_text.strip():
42
+ logger.warning("⚠️ No text content found in PDF")
43
+ raise Exception("No extractable text found in PDF. The file may contain only images or scanned content.")
44
+
45
  logger.info(f"βœ… PDF extraction complete: {len(full_text)} total characters")
46
  return full_text.strip()
47
+
48
+ except MemoryError:
49
+ logger.error("❌ Memory error processing large PDF")
50
+ raise Exception("PDF is too large to process and caused memory issues. Please try a smaller file.")
51
  except Exception as e:
52
  logger.error(f"❌ PDF extraction failed: {e}")
53
+ # Re-raise with more context if it's a size issue
54
+ if "too large" in str(e).lower():
55
+ raise
56
  raise Exception(f"Failed to extract text from PDF: {str(e)}")
57
 
58
  def extract_text_from_pdf_bytes(pdf_bytes):
59
  """Extract text from PDF bytes without temp files"""
60
  try:
61
+ # Check size of bytes
62
+ if len(pdf_bytes) > MAX_PDF_SIZE:
63
+ file_size_mb = len(pdf_bytes) / (1024 * 1024)
64
+ raise Exception(f"PDF file too large ({file_size_mb:.1f}MB). Maximum supported size is {MAX_PDF_SIZE // (1024*1024)}MB.")
65
+
66
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
67
  full_text = ""
68
  for page in doc:
69
  full_text += page.get_text()
70
  doc.close()
71
+
72
+ if not full_text.strip():
73
+ raise Exception("No extractable text found in PDF.")
74
+
75
  return full_text.strip()
76
+ except MemoryError:
77
+ logger.error("❌ Memory error processing large PDF bytes")
78
+ raise Exception("PDF is too large to process. Please try a smaller file.")
79
  except Exception as e:
80
  logger.error(f"❌ PDF bytes extraction failed: {e}")
81
  raise
82
 
 
 
83
  def extract_text_from_pptx(pptx_path):
84
  """Extracts text from PowerPoint (PPTX) files."""
85
  logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")
86
  try:
87
+ # Check file size
88
+ if os.path.exists(pptx_path):
89
+ file_size = os.path.getsize(pptx_path)
90
+ if file_size > 50 * 1024 * 1024: # 50MB
91
+ file_size_mb = file_size / (1024 * 1024)
92
+ raise Exception(f"PPTX file too large ({file_size_mb:.1f}MB). Maximum supported size is 50MB.")
93
+
94
  if not os.path.exists(pptx_path):
95
  raise FileNotFoundError(f"File not found: {pptx_path}")
96
 
 
102
  full_text += shape.text + "\n"
103
  logger.debug(f"πŸ“Š Slide {slide_num + 1} processed")
104
 
105
+ if not full_text.strip():
106
+ raise Exception("No text content found in PowerPoint file.")
107
+
108
  logger.info(f"βœ… PPTX extraction complete: {len(full_text)} total characters")
109
  return full_text.strip()
110
+ except MemoryError:
111
+ logger.error("❌ Memory error processing large PPTX")
112
+ raise Exception("PowerPoint file is too large to process. Please try a smaller file.")
113
  except Exception as e:
114
  logger.error(f"❌ PPTX extraction failed: {e}")
115
  raise Exception(f"Failed to extract text from PowerPoint: {str(e)}")
 
118
  """Extracts text from Word (DOCX) files."""
119
  logger.info(f"πŸ“ Extracting text from DOCX: {docx_path}")
120
  try:
121
+ # Check file size
122
+ if os.path.exists(docx_path):
123
+ file_size = os.path.getsize(docx_path)
124
+ if file_size > 50 * 1024 * 1024: # 50MB
125
+ file_size_mb = file_size / (1024 * 1024)
126
+ raise Exception(f"DOCX file too large ({file_size_mb:.1f}MB). Maximum supported size is 50MB.")
127
+
128
  if not os.path.exists(docx_path):
129
  raise FileNotFoundError(f"File not found: {docx_path}")
130
 
131
  doc = Document(docx_path)
132
  full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
133
+
134
+ if not full_text.strip():
135
+ raise Exception("No text content found in Word document.")
136
+
137
  logger.info(f"βœ… DOCX extraction complete: {len(full_text)} total characters")
138
  return full_text.strip()
139
+ except MemoryError:
140
+ logger.error("❌ Memory error processing large DOCX")
141
+ raise Exception("Word document is too large to process. Please try a smaller file.")
142
  except Exception as e:
143
  logger.error(f"❌ DOCX extraction failed: {e}")
144
+ raise Exception(f"Failed to extract text from Word document: {str(e)}")
145
+
146
+ # import os
147
+ # import fitz # PyMuPDF
148
+ # from pptx import Presentation
149
+ # from docx import Document
150
+ # import logging
151
+
152
+ # logger = logging.getLogger(__name__)
153
+
154
+ # def extract_text_from_pdf(pdf_path):
155
+ # """Extracts text from PDF files - enhanced for Hugging Face"""
156
+ # logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
157
+ # try:
158
+ # if not os.path.exists(pdf_path):
159
+ # raise FileNotFoundError(f"File not found: {pdf_path}")
160
+
161
+ # doc = fitz.open(pdf_path)
162
+ # full_text = ""
163
+ # for page_num, page in enumerate(doc):
164
+ # page_text = page.get_text()
165
+ # full_text += page_text
166
+ # logger.debug(f"πŸ“„ Page {page_num + 1}: {len(page_text)} characters")
167
+
168
+ # doc.close()
169
+ # logger.info(f"βœ… PDF extraction complete: {len(full_text)} total characters")
170
+ # return full_text.strip()
171
+ # except Exception as e:
172
+ # logger.error(f"❌ PDF extraction failed: {e}")
173
+ # raise Exception(f"Failed to extract text from PDF: {str(e)}")
174
+
175
+ # def extract_text_from_pdf_bytes(pdf_bytes):
176
+ # """Extract text from PDF bytes without temp files"""
177
+ # try:
178
+ # doc = fitz.open(stream=pdf_bytes, filetype="pdf")
179
+ # full_text = ""
180
+ # for page in doc:
181
+ # full_text += page.get_text()
182
+ # doc.close()
183
+ # return full_text.strip()
184
+ # except Exception as e:
185
+ # logger.error(f"❌ PDF bytes extraction failed: {e}")
186
+ # raise
187
+
188
+ # # Keep your existing PPTX and DOCX functions as they are...
189
+
190
+ # def extract_text_from_pptx(pptx_path):
191
+ # """Extracts text from PowerPoint (PPTX) files."""
192
+ # logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")
193
+ # try:
194
+ # if not os.path.exists(pptx_path):
195
+ # raise FileNotFoundError(f"File not found: {pptx_path}")
196
+
197
+ # prs = Presentation(pptx_path)
198
+ # full_text = ""
199
+ # for slide_num, slide in enumerate(prs.slides):
200
+ # for shape in slide.shapes:
201
+ # if hasattr(shape, "text") and shape.text.strip():
202
+ # full_text += shape.text + "\n"
203
+ # logger.debug(f"πŸ“Š Slide {slide_num + 1} processed")
204
+
205
+ # logger.info(f"βœ… PPTX extraction complete: {len(full_text)} total characters")
206
+ # return full_text.strip()
207
+ # except Exception as e:
208
+ # logger.error(f"❌ PPTX extraction failed: {e}")
209
+ # raise Exception(f"Failed to extract text from PowerPoint: {str(e)}")
210
+
211
+ # def extract_text_from_docx(docx_path):
212
+ # """Extracts text from Word (DOCX) files."""
213
+ # logger.info(f"πŸ“ Extracting text from DOCX: {docx_path}")
214
+ # try:
215
+ # if not os.path.exists(docx_path):
216
+ # raise FileNotFoundError(f"File not found: {docx_path}")
217
+
218
+ # doc = Document(docx_path)
219
+ # full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
220
+ # logger.info(f"βœ… DOCX extraction complete: {len(full_text)} total characters")
221
+ # return full_text.strip()
222
+ # except Exception as e:
223
+ # logger.error(f"❌ DOCX extraction failed: {e}")
224
+ # raise Exception(f"Failed to extract text from Word document: {str(e)}")