# GeekBot / test /test1.py
# abrar-adnan's picture
# Initial commit
# 6acfeaf verified
from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os
### 🔹 Docling PDF Parsing
def parse_with_docling(pdf_path):
    """Convert a document with Docling and split its Markdown export by headers.

    The file is converted with ``DocumentConverter``, exported to Markdown,
    and split on ``#``, ``##`` and ``###`` headers with
    ``MarkdownHeaderTextSplitter``. Every resulting section is printed to
    stdout, separated by a line of dashes.

    Args:
        pdf_path: Path of the document to convert.

    Returns:
        The list of sections returned by the splitter, or an empty list when
        the path is not an existing file or any processing step fails.
        Failures are reported on stdout rather than raised.
    """
    try:
        # Require a regular file: a directory would pass a plain existence
        # check and only fail later, after the converter has been built.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"File not found: {pdf_path}")

        converter = DocumentConverter()
        markdown_document = converter.convert(pdf_path).document.export_to_markdown()

        # Headers to split on (modify as needed).
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        docs_list = markdown_splitter.split_text(markdown_document)

        print("\n✅ Full Extracted Content (Docling):")
        for section_number, doc in enumerate(docs_list, start=1):
            print(f"\n🔹 Section {section_number}:\n{doc}\n" + "-" * 80)
        return docs_list
    except Exception as e:
        # Deliberate best-effort boundary: this is a comparison script, so a
        # failure is reported and an empty result returned instead of aborting.
        print(f"\n❌ Error during Docling processing: {e}")
        return []
### 🔹 LangChain PDF Parsing
def parse_with_langchain(pdf_path):
    """Extract the text of a PDF with LangChain's ``PyPDFLoader`` and print it.

    The ``page_content`` of every item returned by the loader is joined with
    a blank line between items, and the combined text is printed to stdout
    followed by a line of ``=`` characters.

    Args:
        pdf_path: Path of the PDF to load.

    Returns:
        The combined text, or an empty string when the path is not an
        existing file or loading fails. Failures are reported on stdout
        rather than raised.
    """
    try:
        # Require a regular file: a directory would pass a plain existence
        # check and only fail later, inside the loader.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"File not found: {pdf_path}")

        pages = PyPDFLoader(pdf_path).load()
        text = "\n\n".join(page.page_content for page in pages)

        print("\n✅ Full Extracted Content (LangChain):\n")
        print(text)
        print("\n" + "=" * 100)
        return text
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return an
        # empty result so the caller can continue with the next extraction.
        print(f"\n❌ Error during LangChain processing: {e}")
        return ""
### 🔹 Main Execution
def main():
    """Run the Docling and LangChain extractors on each sample document.

    For every sample, Docling runs first and LangChain second. Each run is
    announced with a progress message, and the extractors print their own
    results. The return values are intentionally discarded.
    """
    # (label shown in the progress message, path of the sample document).
    # Paths are relative, so the script depends on the working directory.
    # NOTE(review): the "scanned PDF" sample is a .png file; PyPDFLoader
    # presumably cannot read it and will report an error — confirm whether
    # that comparison is intentional.
    samples = (
        ("OCR", "test/ocr_test.pdf"),
        ("scanned PDF", "test/sample.png"),
    )

    for label, sample_path in samples:
        print(f"\n🔍 Running Docling Extraction for {label}...")
        parse_with_docling(sample_path)

        print(f"\n🔍 Running LangChain Extraction for {label}...")
        parse_with_langchain(sample_path)
# Script entry point: run the extraction comparison only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()