import streamlit as st
import tempfile
import os
from utils.file_utils import extract_text_from_pdf, extract_text_from_pptx, extract_text_from_docx
import logging

# Module-level logger named after this module
logger = logging.getLogger(__name__)

# Upload size thresholds, expressed in bytes
MAX_FILE_SIZE = 50 * 1024 ** 2  # 50MB
RECOMMENDED_FILE_SIZE = 10 * 1024 ** 2  # 10MB

def _pdf_text_from_bytes(file_content):
    """Return the concatenated text of every page of an in-memory PDF."""
    # Imported lazily so that a missing PyMuPDF raises inside the caller's
    # try block and triggers the temp-file fallback there.
    import fitz

    doc = fitz.open(stream=file_content, filetype="pdf")
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even if a page fails to parse.
        doc.close()


def _pptx_text_from_bytes(file_content):
    """Return the text of every text-bearing shape in an in-memory PPTX."""
    from io import BytesIO
    from pptx import Presentation

    prs = Presentation(BytesIO(file_content))
    shape_texts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text:
                shape_texts.append(shape.text)
    return "\n".join(shape_texts).strip()


def _docx_text_from_bytes(file_content):
    """Return the non-blank paragraphs of an in-memory DOCX, one per line."""
    from io import BytesIO
    from docx import Document

    doc = Document(BytesIO(file_content))
    paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
    return "\n".join(paragraphs).strip()


def _empty_text_message(file_size):
    """Pick the "no text extracted" message appropriate for the file size."""
    if file_size > 20 * 1024 * 1024:  # 20MB
        return "No text could be extracted from this large file. The file may contain mostly images or scanned content. Try a smaller file or ensure the file contains extractable text."
    if file_size > 5 * 1024 * 1024:  # 5MB
        return "No text could be extracted. Large files often contain scanned images without OCR text. Try a smaller file or use OCR software first."
    return "No text could be extracted. The file might contain only images, be password protected, or have scanned content without OCR."


def process_uploaded_file(uploaded_file):
    """Extract plain text from an uploaded PDF, PPTX, or DOCX file.

    The upload is parsed from its in-memory bytes (the approach the module
    uses for Hugging Face Spaces). If in-memory PDF parsing fails, the PDF is
    retried through ``process_pdf_with_minimal_temp``. Files larger than
    ``MAX_FILE_SIZE`` are rejected; files larger than
    ``RECOMMENDED_FILE_SIZE`` are processed after a Streamlit warning.

    Args:
        uploaded_file: Upload object exposing ``name`` and ``getvalue()``.
            The format is chosen from the suffix of the lower-cased name.

    Returns:
        tuple: ``(text, None)`` on success, or ``(None, message)`` where
        ``message`` is a user-facing description of why no text is
        available. Most parsing errors are reported this way rather than
        raised; an error from the initial ``getvalue()`` read propagates.
    """
    logger.info(f"🔄 Starting file processing: {uploaded_file.name}")

    # Read the payload once and reuse it for the size check and the parsers.
    file_content = uploaded_file.getvalue()
    file_size = len(file_content)
    file_size_mb = file_size / (1024 * 1024)

    if file_size > MAX_FILE_SIZE:
        return None, f"📁 File too large ({file_size_mb:.1f}MB). Maximum file size is {MAX_FILE_SIZE // (1024*1024)}MB. Please use a smaller file or split your content."

    if file_size > RECOMMENDED_FILE_SIZE:
        st.warning(f"⚠️ Large file detected ({file_size_mb:.1f}MB). Processing may take longer. For best results, use files under {RECOMMENDED_FILE_SIZE // (1024*1024)}MB.")

    try:
        # Holds the whole lower-cased filename; only its suffix is tested.
        file_extension = uploaded_file.name.lower()

        logger.info(f"📁 Processing {file_extension} file, size: {file_size} bytes")

        if file_extension.endswith('.pdf'):
            logger.info("📄 Processing PDF with direct bytes...")
            try:
                full_text = _pdf_text_from_bytes(file_content)
                logger.info(f"✅ PDF processed: {len(full_text)} chars")
            except MemoryError:
                # A temp-file retry is unlikely to help once memory is
                # exhausted; let the outer handler report it.
                raise
            except Exception as pdf_error:
                logger.error(f"❌ PDF bytes failed: {pdf_error}")
                # Fallback to very simple temp file approach. Its result
                # still goes through the empty-text check below instead of
                # being returned as an unconditional success.
                full_text, fallback_error = process_pdf_with_minimal_temp(uploaded_file)
                if fallback_error:
                    return None, fallback_error

        elif file_extension.endswith('.pptx'):
            logger.info("📊 Processing PPTX with BytesIO...")
            full_text = _pptx_text_from_bytes(file_content)
            logger.info(f"✅ PPTX processed: {len(full_text)} chars")

        elif file_extension.endswith('.docx'):
            logger.info("📝 Processing DOCX with BytesIO...")
            full_text = _docx_text_from_bytes(file_content)
            logger.info(f"✅ DOCX processed: {len(full_text)} chars")

        else:
            return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX."

        # ``not full_text`` also covers a fallback that yielded None.
        if not full_text or not full_text.strip():
            return None, _empty_text_message(file_size)

        return full_text, None

    except MemoryError:
        logger.error("❌ Memory error processing large file")
        return None, "File is too large to process and caused memory issues. Please try a smaller file (under 20MB recommended)."
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"❌ File processing failed: {str(e)}")
        error_text = str(e).lower()
        if "memory" in error_text or "large" in error_text:
            return None, "File is too large to process. Please use a smaller file (under 20MB recommended)."
        return None, f"Error processing file: {str(e)}"

def process_pdf_with_minimal_temp(uploaded_file):
    """Extract PDF text through a short-lived temp file, as a last resort.

    The upload's bytes are written to a ``.pdf`` temporary file, passed to
    ``extract_text_from_pdf`` by path, and the file is removed again before
    returning, whether extraction succeeded or failed.

    Args:
        uploaded_file: Upload object exposing ``getvalue()``.

    Returns:
        tuple: ``(text, None)`` on success, where ``text`` is whatever
        ``extract_text_from_pdf`` returned (it is not validated here), or
        ``(None, message)`` with a user-facing error description.
    """
    tmp_path = None
    try:
        # Read the payload once; reused for the size check and the write.
        file_content = uploaded_file.getvalue()
        file_size = len(file_content)
        if file_size > MAX_FILE_SIZE:
            file_size_mb = file_size / (1024 * 1024)
            return None, f"File too large ({file_size_mb:.1f}MB). Maximum file size is {MAX_FILE_SIZE // (1024*1024)}MB."

        # delete=False so the file can be reopened by path after the handle
        # is closed; removal is handled in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            # Record the path before writing so a failed write is cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(file_content)

        full_text = extract_text_from_pdf(tmp_path)
        return full_text, None

    except MemoryError:
        return None, "File is too large to process. Please try a smaller file (under 20MB recommended)."
    except Exception as e:
        if "too large" in str(e).lower():
            return None, str(e)
        return None, f"PDF processing failed: {str(e)}"
    finally:
        # Single cleanup point covering success, MemoryError and other
        # failures; removal stays best-effort but is no longer silent.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temp file {tmp_path}: {cleanup_error}")

def get_student_content_input():
    """Render the content-input widgets and return the student's material.

    The student chooses between uploading a PDF/PPTX/DOCX file, which is
    run through ``process_uploaded_file``, and pasting text into a text area.
    Extraction errors are shown in the UI; errors that are not about file
    size also get an expandable list of troubleshooting tips.

    Returns:
        tuple: ``(content_text, filename)``. ``content_text`` is the
        extracted or pasted text, and stays ``""`` when no file has been
        uploaded or extraction failed. ``filename`` is the uploaded file's
        name after a successful extraction, otherwise
        ``"simplified_content"``.
    """
    st.subheader("📚 Provide Your Learning Material")
    content_source = st.radio(
        "How would you like to provide the content?",
        ["Upload File (PDF, PPTX, DOCX)", "Paste Text"],
        key="student_source"
    )

    content_text = ""
    filename = "simplified_content"

    if content_source == "Upload File (PDF, PPTX, DOCX)":
        max_mb = MAX_FILE_SIZE // (1024*1024)
        recommended_mb = RECOMMENDED_FILE_SIZE // (1024*1024)

        # Show file size limits to user
        st.info(f"📁 **File Size Limits:** Maximum {max_mb}MB, recommended under {recommended_mb}MB for best performance")

        uploaded_file = st.file_uploader(
            "Upload your course material",
            type=["pdf", "pptx", "docx"],
            # Derived from MAX_FILE_SIZE so the hint cannot drift from the limit.
            help=f"Upload lecture slides, textbook chapters (max {max_mb}MB)"
        )
        if uploaded_file:
            # Show file size info
            file_size = len(uploaded_file.getvalue())
            file_size_mb = file_size / (1024 * 1024)
            st.caption(f"📊 File size: {file_size_mb:.1f}MB")

            with st.spinner("📖 Reading your document..."):
                extracted_text, error = process_uploaded_file(uploaded_file)

            if error:
                # content_text deliberately stays "" so callers always get a
                # string, matching the paste-text path.
                st.error(f"❌ {error}")
                if "too large" not in error.lower():
                    # Show additional help for extraction failures
                    with st.expander("💡 Troubleshooting tips"):
                        st.write("""
                        **If text extraction fails:**
                        - Ensure the file contains selectable text (not just images/scans)
                        - Try a smaller file or extract specific sections
                        - For scanned documents, use OCR software first
                        - Check if the file is password protected
                        - Try converting to a different format
                        """)
            else:
                st.success("✅ Document processed successfully!")
                content_text = extracted_text
                filename = uploaded_file.name
    else:
        content_text = st.text_area(
            "Paste the content you want to simplify:",
            height=200,
            placeholder="Paste complex textbook content, lecture notes, or any difficult learning material here..."
        )

    return content_text, filename

# import streamlit as st
# import tempfile
# import os
# from utils.file_utils import extract_text_from_pdf, extract_text_from_pptx, extract_text_from_docx
# import logging

# # Set up logging
# logger = logging.getLogger(__name__)

# def process_uploaded_file(uploaded_file):
#     """Process uploaded file with Hugging Face Spaces compatible approach"""
#     logger.info(f"🔄 Starting file processing: {uploaded_file.name}")
    
#     # Don't use temp files at all for Hugging Face Spaces
#     try:
#         file_extension = uploaded_file.name.lower()
#         file_content = uploaded_file.getvalue()
        
#         logger.info(f"📁 Processing {file_extension} file, size: {len(file_content)} bytes")
        
#         # For Hugging Face Spaces, use BytesIO for everything
#         from io import BytesIO
        
#         if file_extension.endswith('.pdf'):
#             # PDFs need temp files for PyMuPDF, but let's try a different approach
#             logger.info("📄 Processing PDF with direct bytes...")
#             try:
#                 # Try using PyMuPDF with bytes
#                 import fitz
#                 doc = fitz.open(stream=file_content, filetype="pdf")
#                 full_text = ""
#                 for page in doc:
#                     full_text += page.get_text()
#                 doc.close()
#                 logger.info(f"✅ PDF processed: {len(full_text)} chars")
#             except Exception as pdf_error:
#                 logger.error(f"❌ PDF bytes failed: {pdf_error}")
#                 # Fallback to very simple temp file approach
#                 return process_pdf_with_minimal_temp(uploaded_file)
                
#         elif file_extension.endswith('.pptx'):
#             logger.info("📊 Processing PPTX with BytesIO...")
#             from pptx import Presentation
#             pptx_file = BytesIO(file_content)
#             prs = Presentation(pptx_file)
#             full_text = ""
#             for slide in prs.slides:
#                 for shape in slide.shapes:
#                     if hasattr(shape, "text") and shape.text:
#                         full_text += shape.text + "\n"
#             full_text = full_text.strip()
#             logger.info(f"✅ PPTX processed: {len(full_text)} chars")
            
#         elif file_extension.endswith('.docx'):
#             logger.info("📝 Processing DOCX with BytesIO...")
#             from docx import Document
#             docx_file = BytesIO(file_content)
#             doc = Document(docx_file)
#             full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
#             full_text = full_text.strip()
#             logger.info(f"✅ DOCX processed: {len(full_text)} chars")
            
#         else:
#             return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX."

#         if not full_text.strip():
#             return None, "No text could be extracted from the file."
            
#         return full_text, None
        
#     except Exception as e:
#         logger.error(f"❌ File processing failed: {str(e)}")
#         return None, f"Error processing file: {str(e)}"

# def process_pdf_with_minimal_temp(uploaded_file):
#     """Minimal temp file approach for PDFs as last resort"""
#     try:
#         # Use Streamlit's temp file handling
#         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
#             tmp_file.write(uploaded_file.getvalue())
#             tmp_path = tmp_file.name
        
#         # Extract text
#         full_text = extract_text_from_pdf(tmp_path)
        
#         # Immediate cleanup
#         try:
#             os.unlink(tmp_path)
#         except:
#             pass
            
#         return full_text, None
        
#     except Exception as e:
#         # Cleanup on error
#         try:
#             if 'tmp_path' in locals() and os.path.exists(tmp_path):
#                 os.unlink(tmp_path)
#         except:
#             pass
#         return None, f"PDF processing failed: {str(e)}"

# def get_student_content_input():
#     """Get content input from student (file upload or text)"""
#     st.subheader("📚 Provide Your Learning Material")
#     content_source = st.radio(
#         "How would you like to provide the content?",
#         ["Upload File (PDF, PPTX, DOCX)", "Paste Text"],
#         key="student_source"
#     )
    
#     content_text = ""
#     filename = "simplified_content"
    
#     if content_source == "Upload File (PDF, PPTX, DOCX)":
#         uploaded_file = st.file_uploader(
#             "Upload your course material", 
#             type=["pdf", "pptx", "docx"],
#             help="Upload lecture slides, textbook chapters, or any difficult course material"
#         )
#         if uploaded_file:
#             with st.spinner("📖 Reading your document..."):
#                 # # Add debug info
#                 # st.write(f"📁 Testing file: {uploaded_file.name} ({len(uploaded_file.getvalue())} bytes)")
                
#                 content_text, error = process_uploaded_file(uploaded_file)
                
#                 if error:
#                     st.error(f"❌ {error}")
#                     # Show debug info
#                     with st.expander("🔧 Debug Info"):
#                         st.write(f"File type: {uploaded_file.type}")
#                         st.write(f"File size: {len(uploaded_file.getvalue())} bytes")
#                 else:
#                     st.success("✅ Document processed successfully!")
#                     filename = uploaded_file.name
#     else:
#         content_text = st.text_area(
#             "Paste the content you want to simplify:",
#             height=200,
#             placeholder="Paste complex textbook content, lecture notes, or any difficult learning material here..."
#         )
    
#     return content_text, filename