File size: 2,992 Bytes
6acfeaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os

### 🔹 Docling PDF Parsing
def parse_with_docling(pdf_path):
    """
    Convert a document to Markdown with Docling and split it by header.

    The file is converted with ``DocumentConverter``, exported to Markdown,
    split on ``#`` / ``##`` / ``###`` headers with
    ``MarkdownHeaderTextSplitter``, and every resulting section is printed
    to stdout.

    Args:
        pdf_path: Filesystem path of the document to convert.

    Returns:
        The list of sections returned by the Markdown splitter, or an empty
        list when ``pdf_path`` is not an existing regular file or any step
        fails (the error is printed rather than raised).
    """
    try:
        # Require a regular file: a directory also "exists" but cannot be
        # converted, so report it the same way as a missing file.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"File not found: {pdf_path}")

        # Convert the document and export it as a single Markdown string
        converter = DocumentConverter()
        markdown_document = converter.convert(pdf_path).document.export_to_markdown()

        # Define headers to split on (modify as needed)
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]

        # Split the Markdown into header-delimited sections
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        docs_list = markdown_splitter.split_text(markdown_document)

        # Print full extracted sections, numbered from 1
        print("\n✅ Full Extracted Content (Docling):")
        for idx, doc in enumerate(docs_list, start=1):
            print(f"\n🔹 Section {idx}:\n{doc}\n" + "-" * 80)

        return docs_list

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # result so the caller can continue with other inputs.
        print(f"\n❌ Error during Docling processing: {e}")
        return []

### 🔹 LangChain PDF Parsing
def parse_with_langchain(pdf_path):
    """
    Extract the text of a PDF with LangChain's ``PyPDFLoader`` and print it.

    All pages are loaded, their ``page_content`` is joined with blank lines,
    and the combined text is printed to stdout.

    Args:
        pdf_path: Filesystem path of the PDF to load.

    Returns:
        The combined page text as one string, or an empty string when
        ``pdf_path`` is not an existing regular file or loading fails (the
        error is printed rather than raised).
    """
    try:
        # Require a regular file: a directory also "exists" but cannot be
        # loaded, so report it the same way as a missing file.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"File not found: {pdf_path}")

        # Load PDF using PyPDFLoader
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Join the text of all pages, separated by a blank line
        text = "\n\n".join(page.page_content for page in pages)

        # Print full extracted content
        print("\n✅ Full Extracted Content (LangChain):\n")
        print(text)
        print("\n" + "=" * 100)

        return text

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # result so the caller can continue with other inputs.
        print(f"\n❌ Error during LangChain processing: {e}")
        return ""

### 🔹 Main Execution
def main():
    """
    Run both extractors over the two sample inputs.

    Each input is processed first with ``parse_with_docling`` and then with
    ``parse_with_langchain``. Both parsers print what they extract (or an
    error message) themselves, so their return values are not kept here.
    """
    # (label used in the progress messages, input path), in processing order.
    # NOTE(review): the "scanned PDF" sample is a .png image, while
    # parse_with_langchain uses a PDF loader — presumably that pass only
    # reports an error; confirm whether a PDF sample was intended.
    samples = [
        ("OCR", "test/ocr_test.pdf"),
        ("scanned PDF", "test/sample.png"),
    ]

    for label, path in samples:
        print(f"\n🔍 Running Docling Extraction for {label}...")
        parse_with_docling(path)

        print(f"\n🔍 Running LangChain Extraction for {label}...")
        parse_with_langchain(path)

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()