import logging
from logging.handlers import RotatingFileHandler

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

logger.debug("Application started")

# Model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    trust_remote_code=True,
)

# Create Hugging Face text-generation pipeline with fixed sampling settings
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)

# Wrap the pipeline so LangChain can use it as an LLM
chat_model = HuggingFacePipeline(pipeline=pipe)

logger.debug("Model and tokenizer loaded successfully")

# Conversation template using Qwen's ChatML-style tags
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""

# Create LangChain prompt and chain
prompt = PromptTemplate(
    template=template,
    input_variables=["system_prompt", "history", "human_input"],
)
chain = LLMChain(llm=chat_model, prompt=prompt)


# Format the conversation history into ChatML turns
def format_history(history):
    formatted = ""
    for human, ai in history or []:
        formatted += (
            f"<|im_start|>user\n{human}\n<|im_end|>\n"
            f"<|im_start|>assistant\n{ai}\n<|im_end|>\n"
        )
    return formatted


# Prediction function using the LangChain chain.
# Note: the sampling sliders are passed in here but the pipeline above was
# created with fixed generation settings, so they are not applied per request.
def predict(
    message,
    history,
    system_prompt,
    temperature,
    max_new_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    formatted_history = format_history(history)
    try:
        result = chain.run(
            {
                "system_prompt": system_prompt,
                "history": formatted_history,
                "human_input": message,
            }
        )
        return result
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."


# Gradio UI
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.State([]),
        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs="text",
    title="Qwen2.5-Coder-7B-Instruct with LangChain",
    live=True,
).launch()