ibraheem007 committed on
Commit
31a0760
·
verified ·
1 Parent(s): 871f67b

Create utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +38 -0
utils/file_utils.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ from pptx import Presentation
4
+ from docx import Document
5
+
6
def extract_text_from_pdf(pdf_path) -> str:
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped.

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    # Opening the document as a context manager guarantees it is closed even
    # when extracting a page raises; a trailing close() call would be skipped
    # in that case and leak the handle.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]

    # Join once instead of growing a string with += on every page.
    return "".join(page_texts).strip()
17
+
18
def extract_text_from_pptx(pptx_path) -> str:
    """Extract text from the shapes of a PowerPoint (PPTX) file.

    Only shapes found directly in each slide's ``shapes`` collection that
    have a ``text`` attribute are read; each shape's text is placed on its
    own line, in slide order.

    Args:
        pptx_path: Filesystem path of the presentation to read.

    Returns:
        The collected shape texts separated by newlines, with leading and
        trailing whitespace stripped.

    Raises:
        FileNotFoundError: If nothing exists at ``pptx_path``.
    """
    if not os.path.exists(pptx_path):
        raise FileNotFoundError(f"File not found: {pptx_path}")

    prs = Presentation(pptx_path)

    # Collect the pieces and join once rather than growing a string with +=.
    shape_texts = []
    for slide in prs.slides:
        shape_texts.extend(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )

    # Joining with "\n" differs from appending "\n" after every shape only by
    # one trailing newline, which strip() removes either way.
    return "\n".join(shape_texts).strip()
30
+
31
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a Word (DOCX) file as a single string.

    The text of each entry in the document's ``paragraphs`` collection is
    joined with newlines, and surrounding whitespace is stripped from the
    result.

    Raises:
        FileNotFoundError: If nothing exists at ``docx_path``.
    """
    path_is_missing = not os.path.exists(docx_path)
    if path_is_missing:
        raise FileNotFoundError(f"File not found: {docx_path}")

    paragraphs = Document(docx_path).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined.strip()