import logging
from logging.handlers import RotatingFileHandler

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

logger.debug("Application started")

# Model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    trust_remote_code=True,
)

# Create Hugging Face text-generation pipeline with fixed sampling settings
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)

# Wrap the pipeline so LangChain can use it as an LLM
chat_model = HuggingFacePipeline(pipeline=pipe)

logger.debug("Model and tokenizer loaded successfully")

# Conversation template using Qwen's ChatML-style tags
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""

# Create LangChain prompt and chain
prompt = PromptTemplate(
    template=template,
    input_variables=["system_prompt", "history", "human_input"],
)
chain = LLMChain(llm=chat_model, prompt=prompt)


# Format the conversation history into ChatML turns
def format_history(history):
    formatted = ""
    for human, ai in history or []:
        formatted += (
            f"<|im_start|>user\n{human}\n<|im_end|>\n"
            f"<|im_start|>assistant\n{ai}\n<|im_end|>\n"
        )
    return formatted


# Prediction function using the LangChain chain.
# Note: the sampling sliders are passed in here but the pipeline above was
# created with fixed generation settings, so they are not applied per request.
def predict(
    message,
    history,
    system_prompt,
    temperature,
    max_new_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    formatted_history = format_history(history)
    try:
        result = chain.run(
            {
                "system_prompt": system_prompt,
                "history": formatted_history,
                "human_input": message,
            }
        )
        return result
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."


# Gradio UI
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.State([]),
        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs="text",
    title="Qwen2.5-Coder-7B-Instruct with LangChain",
    live=True,
).launch()