Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| import requests | |
| import base64 | |
| import re | |
| from typing import Tuple | |
| import markdown | |
| from dotenv import load_dotenv | |
| from openai import OpenAI | |
| from urllib.parse import urlparse | |
# Pull settings from a local .env file into the environment before they are
# read below.
load_dotenv()

# PaddleOCR-VL endpoint and its optional bearer token. Both default to an
# empty string when the corresponding variable is not set.
API_URL = os.environ.get("API_URL", "")
TOKEN = os.environ.get("TOKEN", "")
class Doc2PageConverter:
    """Turn a PDF or image into Markdown and then into a standalone HTML page.

    Text extraction is delegated to the PaddleOCR-VL HTTP endpoint configured
    through the module-level ``API_URL`` / ``TOKEN`` settings. HTML generation
    uses the ERNIE model on Qianfan when ``QIANFAN_TOKEN`` is set and falls
    back to the local ``markdown`` package otherwise.
    """

    # ``fileType`` codes sent to the OCR endpoint.
    _FILE_TYPE_PDF = 0
    _FILE_TYPE_IMAGE = 1
    # Image extensions accepted by the OCR step (compared lower-cased).
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    # Network timeouts, in seconds.
    _DOWNLOAD_TIMEOUT = 60
    _OCR_TIMEOUT = 300

    def __init__(self):
        """Read ``QIANFAN_TOKEN`` and create the ERNIE client when it is set.

        ``self.client`` stays ``None`` without a token, which makes
        ``markdown_to_html_with_ernie`` use the local fallback renderer.
        """
        self.qianfan_token = os.getenv('QIANFAN_TOKEN')
        self.qianfan_model = "ernie-x1.1-preview"
        self.client = None
        if self.qianfan_token:
            self.client = OpenAI(
                base_url="https://qianfan.baidubce.com/v2",
                api_key=self.qianfan_token
            )

    @classmethod
    def _detect_file_type(cls, file_path, is_url):
        """Return the ``fileType`` code for *file_path* based on its extension."""
        # For URLs only the path component is inspected, so a query string
        # does not hide the extension.
        name = urlparse(file_path).path if is_url else file_path
        ext = os.path.splitext(name)[1].lower()
        if ext == '.pdf':
            return cls._FILE_TYPE_PDF
        if ext in cls._IMAGE_EXTENSIONS:
            return cls._FILE_TYPE_IMAGE
        raise ValueError(f"不支持的文件类型: '{ext}'")

    @classmethod
    def _read_source_bytes(cls, file_path, is_url):
        """Return the raw bytes of a local file or of a downloaded URL."""
        if is_url:
            response = requests.get(file_path, timeout=cls._DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            return response.content
        with open(file_path, "rb") as f:
            return f.read()

    @staticmethod
    def _strip_code_fence(text):
        """Remove a surrounding Markdown code fence from *text*, if present."""
        for opener in ('```html', '```'):
            if text.startswith(opener):
                text = text[len(opener):]
                break
        if text.endswith('```'):
            text = text[:-3]
        return text.strip()

    def extract_text_with_vl_api(self, file_path: str) -> str:
        """Run PaddleOCR-VL on a document and return its Markdown text.

        Args:
            file_path: Local path or http(s) URL of a PDF or image
                (.png, .jpg, .jpeg, .bmp, .gif).

        Returns:
            The Markdown of the first layout-parsing result, with image
            placeholders in ``src="..."`` attributes replaced by the URLs
            supplied by the API. An empty string when the API returns no
            layout results.

        Raises:
            ValueError: ``API_URL`` is not configured.
            RuntimeError: The file could not be read or has an unsupported
                extension, the HTTP request failed, the response was not
                valid JSON, or the API reported an error code.
        """
        if not API_URL:
            raise ValueError("API_URL must be configured in .env file")

        headers = {"Content-Type": "application/json"}
        if TOKEN:
            headers["Authorization"] = f"bearer {TOKEN}"

        try:
            is_url = isinstance(file_path, str) and file_path.startswith(("http://", "https://"))
            file_type = self._detect_file_type(file_path, is_url)
            content = self._read_source_bytes(file_path, is_url)
            b64_content = base64.b64encode(content).decode("utf-8")
        except Exception as e:
            raise RuntimeError(f"读取和编码文件失败: {e}") from e

        payload = {
            "file": b64_content,
            "fileType": file_type,
            "useLayoutDetection": True,
            "useDocUnwarping": False,
            "useDocOrientationClassify": False,
            "useChartRecognition": False,
        }

        try:
            print(f"Sending PaddleOCR-VL API request to {API_URL}...")
            response = requests.post(
                API_URL, json=payload, headers=headers, timeout=self._OCR_TIMEOUT
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"PaddleOCR-VL API request failed: {e}") from e

        # Decoding is handled separately from the request so a malformed body
        # gets its own message. ValueError is the common base of the JSON
        # decode errors, so the ``json`` module does not have to be imported.
        try:
            result_data = response.json()
        except ValueError as e:
            raise RuntimeError(f"Invalid JSON response from VL API: {response.text}") from e
        if not isinstance(result_data, dict):
            raise RuntimeError(f"Invalid JSON response from VL API: {response.text}")

        if result_data.get("errorCode", -1) != 0:
            error_msg = result_data.get("errorMessage", "Unknown API error")
            raise RuntimeError(f"PaddleOCR-VL API returned an error: {error_msg}")

        # ``or`` guards against keys that are present but null.
        layout_results = (result_data.get("result") or {}).get("layoutParsingResults") or []
        if not layout_results:
            return ""

        # NOTE(review): only the first entry is used. If the API returns one
        # entry per page, later pages of a multi-page PDF are dropped here;
        # confirm whether that is intended.
        first_page_result = layout_results[0]
        markdown_data = first_page_result.get("markdown") or {}
        full_markdown_text = markdown_data.get("text") or ""
        image_map = markdown_data.get("images") or {}
        for placeholder, real_url in image_map.items():
            full_markdown_text = full_markdown_text.replace(
                f'src="{placeholder}"', f'src="{real_url}"'
            )
        return full_markdown_text

    def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
        """Ask the ERNIE model to render *markdown_text* as a styled HTML page.

        Falls back to ``basic_markdown_to_html`` when no client is configured,
        when the API call fails, or when the model returns no content.

        Args:
            markdown_text: Markdown source to convert.

        Returns:
            HTML text with any surrounding Markdown code fence removed.
        """
        if not self.client:
            return self.basic_markdown_to_html(markdown_text)

        prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.
Design requirements:
- Use Inter font from Google Fonts
- Clean, modern spacing and typography
- Subtle shadows and rounded corners
- Good color contrast and hierarchy
- Responsive design that works on all devices
- Include proper HTML structure with head, body, and semantic elements
Important: Add a footer at the bottom with "Powered by PaddleOCR-VL and ERNIE" where PaddleOCR-VL links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.
Markdown content:
{markdown_text}
IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""

        try:
            response = self.client.chat.completions.create(
                model=self.qianfan_model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=64000,
            )
            raw_reply = response.choices[0].message.content or ""
            html_content = self._strip_code_fence(raw_reply.strip())
        except Exception as e:
            # Deliberately broad: any failure of the remote model should
            # degrade to the local renderer instead of aborting the request.
            print(f"Error calling ERNIE API: {e}")
            return self.basic_markdown_to_html(markdown_text)

        if not html_content:
            # An empty reply would otherwise be reported as a failed
            # conversion by the caller.
            print("Error calling ERNIE API: empty response content")
            return self.basic_markdown_to_html(markdown_text)
        return html_content

    def basic_markdown_to_html(self, markdown_text: str) -> str:
        """Render *markdown_text* locally and wrap it in a styled HTML page.

        Used as the fallback when the ERNIE model is unavailable.

        Args:
            markdown_text: Markdown source to convert.

        Returns:
            A complete HTML document containing the rendered Markdown and the
            "Powered by" footer.
        """
        html = markdown.markdown(markdown_text)
        # Doubled braces are literal CSS braces inside the f-string.
        complete_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
body {{
font-family: 'Inter', system-ui, sans-serif; line-height: 1.7; color: #1a1a1a;
max-width: 850px; margin: 0 auto; padding: 32px 24px; background: #fafafa;
}}
.container {{
background: #ffffff; padding: 48px; border-radius: 12px;
box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 24px rgba(0,0,0,0.04);
}}
img {{ max-width: 100%; height: auto; border-radius: 8px; margin: 20px 0; }}
.footer {{
margin-top: 64px; padding-top: 24px; border-top: 1px solid #e5e7eb;
text-align: center; font-size: 14px; color: #6b7280;
}}
.footer a {{ color: #6366f1; text-decoration: none; }}
.footer a:hover {{ text-decoration: underline; }}
</style>
</head>
<body>
<div class="container">
{html}
<div class="footer">
Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> and
<a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
</div>
</div>
</body>
</html>
"""
        return complete_html

    def process_document(self, file_path: str) -> Tuple[str, str]:
        """Extract Markdown from a document and convert it to HTML.

        Args:
            file_path: Local path or URL of the document to process.

        Returns:
            ``(markdown, html)`` on success. When nothing was extracted or an
            error occurred, the first item is a warning or error message and
            the second is an empty string; this method does not raise.
        """
        try:
            markdown_content = self.extract_text_with_vl_api(file_path)
            if not markdown_content.strip():
                return "Warning: No text content extracted from the document.", ""
            html_content = self.markdown_to_html_with_ernie(markdown_content)
            return markdown_content, html_content
        except Exception as e:
            # Boundary for the UI layer: failures are reported as text.
            return f"Error processing document: {str(e)}", ""
# --- Gradio UI and event handling ---
# One converter instance is created at import time and shared by the handlers.
converter = Doc2PageConverter()
def process_upload(file):
    """Convert an uploaded file and return ``(status, markdown, html)``.

    ``file`` is the value delivered by the upload widget; its ``name``
    attribute is passed to the converter as the path. On any failure the
    status carries the message and the other two items are empty strings.
    """
    if file is None:
        return "Please upload a file.", "", ""

    try:
        md_text, page_html = converter.process_document(file.name)
    except Exception as exc:
        return f"Error: {str(exc)}", "", ""

    if not page_html:
        # The converter puts its warning or error message in the first slot
        # when no HTML was produced, so it is surfaced as the status.
        return md_text, "", ""
    return "Document processed successfully!", md_text, page_html
def save_html_file(html_content, filename="converted_page"):
    """Write *html_content* to a new temporary ``.html`` file.

    Args:
        html_content: HTML text to save. Nothing is written when it is empty
            or ``None``.
        filename: Prefix for the temporary file name.

    Returns:
        The path of the created file, or ``None`` when there was nothing to
        save. The file is not deleted automatically.
    """
    if not html_content:
        return None
    # UTF-8 is set explicitly so non-ASCII text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.html',
        delete=False,
        prefix=f"{filename}_",
        encoding='utf-8',
    ) as temp_file:
        temp_file.write(html_content)
    return temp_file.name
# Visual theme for the app: Gradio's default theme with blue/gray hues and
# the Inter font stack, followed by explicit colour overrides.
custom_theme = gr.themes.Default(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="gray",
    font=("Inter", "system-ui", "sans-serif"),
)
custom_theme = custom_theme.set(
    body_background_fill="#fafafa",
    background_fill_primary="#ffffff",
    border_color_primary="#e5e7eb",
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_hover="#4f46e5",
    button_primary_text_color="#ffffff",
)
# Page layout: upload controls on the left, result tabs on the right, and a
# download row that stays hidden until a conversion succeeds.
with gr.Blocks(
    title="Doc2Page - Document to Webpage Converter",
    theme=custom_theme,
    css=".gradio-container { max-width: 1200px !important; margin: auto; }"
) as app:
    gr.Markdown("# Doc2Page\n🥃 Transform your documents into beautiful webpages!")
    with gr.Row():
        with gr.Column(scale=1, min_width=350):
            file_input = gr.File(
                label="📄 Upload Document",
                # Kept in step with the extensions the converter accepts, so
                # the picker does not offer files that are always rejected.
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".gif"],
            )
            process_btn = gr.Button("✨ Convert to Webpage", variant="primary")
            status_output = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("❤️ Preview"):
                    html_preview = gr.HTML(label="", value="<div style='text-align: center; color: #6b7280;'>Your converted webpage will appear here</div>")
                with gr.TabItem("📝 Markdown Source"):
                    markdown_output = gr.Textbox(label="", interactive=False, show_copy_button=True)
                with gr.TabItem("🌐 HTML Source"):
                    html_output = gr.Code(label="", language="html", interactive=False)
    with gr.Row(visible=False) as download_section:
        gr.Markdown("### 📥 Download Your Webpage")
        download_btn = gr.File(label="HTML File", visible=True)

    def process_and_update(file):
        """Run the conversion and build the values for every output widget.

        Returns a 6-tuple in the order of the ``outputs`` list wired to the
        button: status text, Markdown source, HTML source, preview HTML,
        path of the downloadable file (or ``None``), and a visibility update
        for the download row.
        """
        status, markdown_content, html_content = process_upload(file)
        download_file = None
        show_download = False
        if html_content:
            # Name the download after the uploaded document.
            filename = Path(file.name).stem if file else "converted_page"
            download_file = save_html_file(html_content, filename)
            show_download = True
        preview_content = html_content or "<div style='text-align: center; color: #9ca3af;'>No preview available</div>"
        return (
            status, markdown_content, html_content, preview_content,
            download_file, gr.update(visible=show_download)
        )

    process_btn.click(
        fn=process_and_update,
        inputs=[file_input],
        outputs=[status_output, markdown_output, html_output, html_preview, download_btn, download_section]
    )
    gr.Markdown(
        """<div style="text-align: center; padding: 20px 0; color: #6b7280;">
Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> &
<a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
</div>"""
    )

if __name__ == "__main__":
    app.launch()