# Hugging Face Space: PaddlePaddle/doc2page (status: Running)
# File size: 12,438 bytes

import gradio as gr
import os
import tempfile
from pathlib import Path
import requests
import base64
import re
from typing import Tuple
import markdown
from dotenv import load_dotenv
from openai import OpenAI
from urllib.parse import urlparse

# Pull settings from a local .env file into the process environment.
load_dotenv()

# PaddleOCR-VL endpoint and access token; each falls back to "" when unset.
API_URL = os.environ.get("API_URL", "")
TOKEN = os.environ.get("TOKEN", "")


class Doc2PageConverter:
    """Turn a document into Markdown via the PaddleOCR-VL API, then into an HTML page.

    The Markdown-to-HTML step asks an ERNIE model (through the OpenAI client
    pointed at the Qianfan base URL) when ``QIANFAN_TOKEN`` is set. Without a
    token, or when that call fails, a local ``markdown``-library conversion
    is used instead.
    """

    def __init__(self):
        """Read ``QIANFAN_TOKEN`` from the environment and build the ERNIE client if it is set."""
        self.qianfan_token = os.getenv('QIANFAN_TOKEN')
        self.qianfan_model = "ernie-x1.1-preview"
        # Stays None without a token; markdown_to_html_with_ernie then uses the fallback.
        self.client = None

        if self.qianfan_token:
            self.client = OpenAI(
                base_url="https://qianfan.baidubce.com/v2",
                api_key=self.qianfan_token
            )

    @staticmethod
    def _is_remote_path(file_path) -> bool:
        """Return True when *file_path* is an http(s) URL string."""
        return isinstance(file_path, str) and file_path.startswith(("http://", "https://"))

    @classmethod
    def _detect_file_type(cls, file_path) -> int:
        """Map the extension of a path or URL to the API ``fileType`` (0 = PDF, 1 = image).

        Raises:
            ValueError: If the extension is not one of the supported types.
        """
        # For URLs only the path component is inspected, so query strings are ignored.
        name = urlparse(file_path).path if cls._is_remote_path(file_path) else file_path
        ext = os.path.splitext(name)[1].lower()

        if ext == '.pdf':
            return 0  # PDF file
        if ext in ('.png', '.jpg', '.jpeg', '.bmp', '.gif'):
            return 1  # image file
        raise ValueError(f"不支持的文件类型: '{ext}'")

    @classmethod
    def _read_source_bytes(cls, file_path) -> bytes:
        """Return the raw bytes of a local file, or download them when given a URL."""
        if cls._is_remote_path(file_path):
            response = requests.get(file_path, timeout=60)
            response.raise_for_status()
            return response.content
        with open(file_path, "rb") as f:
            return f.read()

    @staticmethod
    def _markdown_from_response(result_data: dict) -> str:
        """Extract the Markdown text from a decoded PaddleOCR-VL response.

        Returns ``""`` when the response carries no layout-parsing results.

        Raises:
            RuntimeError: If ``errorCode`` is missing or non-zero.
        """
        if result_data.get("errorCode", -1) != 0:
            error_msg = result_data.get("errorMessage", "Unknown API error")
            raise RuntimeError(f"PaddleOCR-VL API returned an error: {error_msg}")

        # ``or`` guards against keys that are present but null.
        layout_results = (result_data.get("result") or {}).get("layoutParsingResults") or []
        if not layout_results:
            return ""

        # NOTE(review): only the first entry is used; for multi-page PDFs the
        # remaining entries presumably hold later pages and are dropped here.
        # Confirm whether that is intended.
        markdown_data = layout_results[0].get("markdown") or {}
        full_markdown_text = markdown_data.get("text") or ""

        # Swap each image placeholder used in <img src="..."> for its real URL.
        for placeholder, real_url in (markdown_data.get("images") or {}).items():
            full_markdown_text = full_markdown_text.replace(
                f'src="{placeholder}"', f'src="{real_url}"'
            )

        return full_markdown_text

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove surrounding whitespace and one enclosing Markdown code fence, if any."""
        text = text.strip()

        if text.startswith('```html'):
            text = text[7:]
        elif text.startswith('```'):
            text = text[3:]

        if text.endswith('```'):
            text = text[:-3]

        return text.strip()

    def extract_text_with_vl_api(self, file_path: str) -> str:
        """Send a PDF or image to the PaddleOCR-VL API and return its Markdown.

        Args:
            file_path: Local path or http(s) URL of a ``.pdf``, ``.png``,
                ``.jpg``, ``.jpeg``, ``.bmp`` or ``.gif`` file.

        Returns:
            The Markdown text of the first layout-parsing result with image
            placeholders replaced by their URLs, or ``""`` when the API
            returns no results.

        Raises:
            ValueError: If ``API_URL`` is not configured.
            RuntimeError: If the file type is unsupported, the file cannot be
                read, the request fails, the response is not valid JSON, or
                the API reports an error.
        """
        if not API_URL:
            raise ValueError("API_URL must be configured in .env file")

        headers = {"Content-Type": "application/json"}
        if TOKEN:
            headers["Authorization"] = f"bearer {TOKEN}"

        try:
            file_type = self._detect_file_type(file_path)
            content = self._read_source_bytes(file_path)
            b64_content = base64.b64encode(content).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"读取和编码文件失败: {e}") from e

        payload = {
            "file": b64_content,
            "fileType": file_type,
            "useLayoutDetection": True,
            "useDocUnwarping": False,
            "useDocOrientationClassify": False,
            "useChartRecognition": False,
        }

        try:
            print(f"Sending PaddleOCR-VL API request to {API_URL}...")
            response = requests.post(API_URL, json=payload, headers=headers, timeout=300)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"PaddleOCR-VL API request failed: {e}") from e

        # Decoded separately so a malformed body gets its own message. JSON
        # decode errors are ValueError subclasses, so no ``json`` import is needed.
        try:
            result_data = response.json()
        except ValueError as e:
            raise RuntimeError(f"Invalid JSON response from VL API: {response.text}") from e

        return self._markdown_from_response(result_data)

    def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
        """Convert Markdown into a complete HTML page using the ERNIE model.

        Falls back to ``basic_markdown_to_html`` when no client is configured,
        when the API call raises, or when the model returns empty content.

        Args:
            markdown_text: Markdown source to convert.

        Returns:
            The HTML document as a string.
        """
        if not self.client:
            return self.basic_markdown_to_html(markdown_text)

        try:
            prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.
Design requirements:
- Use Inter font from Google Fonts
- Clean, modern spacing and typography  
- Subtle shadows and rounded corners
- Good color contrast and hierarchy
- Responsive design that works on all devices
- Include proper HTML structure with head, body, and semantic elements
Important: Add a footer at the bottom with "Powered by PaddleOCR-VL and ERNIE" where PaddleOCR-VL links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.
Markdown content:
{markdown_text}
IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""

            messages = [{"role": "user", "content": prompt}]

            response = self.client.chat.completions.create(
                model=self.qianfan_model,
                messages=messages,
                max_tokens=64000,
            )

            # The model may still wrap its answer in a code fence despite the prompt.
            html_content = self._strip_code_fences(response.choices[0].message.content)

            if not html_content:
                # An empty page would be treated as a failure by the caller,
                # so route it through the local fallback below instead.
                raise ValueError("ERNIE returned an empty response")

            return html_content

        except Exception as e:
            print(f"Error calling ERNIE API: {e}")
            return self.basic_markdown_to_html(markdown_text)

    def basic_markdown_to_html(self, markdown_text: str) -> str:
        """Render Markdown with the ``markdown`` library inside a fixed, styled HTML template.

        Used as the fallback when the ERNIE conversion is unavailable.
        """
        html = markdown.markdown(markdown_text)

        complete_html = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Converted Document</title>
            <style>
                @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
                body {{
                    font-family: 'Inter', system-ui, sans-serif; line-height: 1.7; color: #1a1a1a;
                    max-width: 850px; margin: 0 auto; padding: 32px 24px; background: #fafafa;
                }}
                .container {{
                    background: #ffffff; padding: 48px; border-radius: 12px;
                    box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 24px rgba(0,0,0,0.04);
                }}
                img {{ max-width: 100%; height: auto; border-radius: 8px; margin: 20px 0; }}
                .footer {{
                    margin-top: 64px; padding-top: 24px; border-top: 1px solid #e5e7eb;
                    text-align: center; font-size: 14px; color: #6b7280;
                }}
                .footer a {{ color: #6366f1; text-decoration: none; }}
                .footer a:hover {{ text-decoration: underline; }}
            </style>
        </head>
        <body>
            <div class="container">
                {html}
                <div class="footer">
                    Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> and
                    <a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
                </div>
            </div>
        </body>
        </html>
        """
        return complete_html

    def process_document(self, file_path: str) -> Tuple[str, str]:
        """Extract Markdown from a document and convert it to HTML.

        Args:
            file_path: Local path or http(s) URL of the document.

        Returns:
            ``(markdown, html)`` on success. On failure, or when no text was
            extracted, ``(message, "")`` where *message* describes the problem;
            this method does not raise.
        """
        try:
            markdown_content = self.extract_text_with_vl_api(file_path)

            if not markdown_content.strip():
                return "Warning: No text content extracted from the document.", ""

            html_content = self.markdown_to_html_with_ernie(markdown_content)

            return markdown_content, html_content

        except Exception as e:
            return f"Error processing document: {str(e)}", ""

# --- Gradio UI and event handling logic ---
# Single module-level converter instance used by process_upload below.
converter = Doc2PageConverter()

def process_upload(file):
    """Run the converter on an uploaded file and report the outcome.

    Returns a ``(status, markdown, html)`` triple. When no file is given or
    the conversion produces no HTML, ``markdown`` and ``html`` are empty
    strings and ``status`` carries the message to show.
    """
    if file is None:
        return "Please upload a file.", "", ""

    try:
        markdown_result, html_result = converter.process_document(file.name)
    except Exception as e:
        return f"Error: {str(e)}", "", ""

    if not html_result:
        # Without HTML, the first element holds the converter's warning/error text.
        return markdown_result, "", ""
    return "Document processed successfully!", markdown_result, html_result

def save_html_file(html_content, filename="converted_page"):
    """Write HTML to a persistent temporary ``.html`` file and return its path.

    Args:
        html_content: HTML text to save. Falsy values (``None``, ``""``) are
            not written.
        filename: Prefix for the temporary file name; an underscore and a
            random part are appended.

    Returns:
        The path of the created file, or ``None`` when there was nothing to save.
    """
    if not html_content:
        return None

    # Written as UTF-8 explicitly: the generated pages declare charset UTF-8,
    # and the locale default encoding may not be able to represent their content.
    # delete=False keeps the file on disk after it is closed.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.html',
        delete=False,
        prefix=f"{filename}_",
        encoding="utf-8",
    ) as temp_file:
        temp_file.write(html_content)
    return temp_file.name

# App theme: Gradio's Default theme with blue/gray hues and the Inter font,
# then individual colour overrides applied through ``.set``.
custom_theme = gr.themes.Default(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="gray",
    font=("Inter", "system-ui", "sans-serif"),
).set(
    # Page and panel surfaces
    body_background_fill="#fafafa",
    background_fill_primary="#ffffff",
    border_color_primary="#e5e7eb",
    # Primary button
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_hover="#4f46e5",
    button_primary_text_color="#ffffff",
)

# Build the Gradio interface: upload + status on the left, result tabs on the
# right, and a download row that is revealed after a successful conversion.
with gr.Blocks(
    title="Doc2Page - Document to Webpage Converter",
    theme=custom_theme,
    css=".gradio-container { max-width: 1200px !important; margin: auto; }"
) as app:

    gr.Markdown("# Doc2Page\n🥃 Transform your documents into beautiful webpages!")

    with gr.Row():
        with gr.Column(scale=1, min_width=350):
            file_input = gr.File(
                label="📄 Upload Document",
                # Kept in step with the extensions accepted by
                # Doc2PageConverter.extract_text_with_vl_api, which rejects
                # anything else (".tiff" was offered here but always failed).
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".gif"],
            )
            process_btn = gr.Button("✨ Convert to Webpage", variant="primary")
            status_output = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("❤️ Preview"):
                    html_preview = gr.HTML(label="", value="<div style='text-align: center; color: #6b7280;'>Your converted webpage will appear here</div>")
                with gr.TabItem("📝 Markdown Source"):
                    markdown_output = gr.Textbox(label="", interactive=False, show_copy_button=True)
                with gr.TabItem("🌐 HTML Source"):
                    html_output = gr.Code(label="", language="html", interactive=False)

    # Hidden until a conversion produces HTML; see process_and_update.
    with gr.Row(visible=False) as download_section:
        gr.Markdown("### 📥 Download Your Webpage")
        download_btn = gr.File(label="HTML File", visible=True)

    def process_and_update(file):
        """Convert the uploaded file and return values for every output component.

        Returns, in order: status text, Markdown source, HTML source, preview
        HTML, path of the downloadable file (or ``None``), and a visibility
        update for the download row.
        """
        status, markdown_content, html_content = process_upload(file)

        download_file = None
        show_download = False
        if html_content:
            # Name the download after the uploaded file (without its extension).
            filename = Path(file.name).stem if file else "converted_page"
            download_file = save_html_file(html_content, filename)
            show_download = True

        preview_content = html_content or "<div style='text-align: center; color: #9ca3af;'>No preview available</div>"

        return (
            status, markdown_content, html_content, preview_content,
            download_file, gr.update(visible=show_download)
        )

    # The order of ``outputs`` must match the tuple returned by process_and_update.
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input],
        outputs=[status_output, markdown_output, html_output, html_preview, download_btn, download_section]
    )

    gr.Markdown(
        """<div style="text-align: center; padding: 20px 0; color: #6b7280;">
        Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> & 
        <a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
        </div>"""
    )

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()