import os
import logging
from threading import Thread
from logging.handlers import RotatingFileHandler

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.debug("Application started")
# Define model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000
# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
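# 4-bit loading via bitsandbytes roughly quarters the weight memory of the 7B model;
# it requires the bitsandbytes package and a CUDA GPU.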
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", quantization_config=quantization_config)
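# device_map="auto" lets Accelerate place the quantized weights on the available GPU(s).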
# Create Hugging Face text-generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    do_sample=True,  # sampling must be enabled for temperature/top_k/top_p to take effect
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)
# Initialize HuggingFacePipeline model for LangChain
chat_model = HuggingFacePipeline(pipeline=pipe)
logger.debug("Model and tokenizer loaded successfully")
# Define the conversation template for LangChain
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""
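# The template above follows the ChatML format (<|im_start|>/<|im_end|> markers)
# that Qwen instruct models are trained on.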
# Create LangChain prompt and chain
prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
chain = LLMChain(llm=chat_model, prompt=prompt)
# Format the running conversation history into ChatML turns
def format_history(history):
    formatted = ""
    for human, ai in history or []:  # history may be None on the first call
        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
    return formatted
# Prediction function using the LangChain chain
# NOTE: the sampling controls from the UI (temperature, max_new_tokens, top_k,
# repetition_penalty, top_p) are received but not forwarded here; generation uses the
# values fixed at pipeline creation time. See the sketch after this function for one
# way to apply them per request.
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    formatted_history = format_history(history)
    try:
        result = chain.run({"system_prompt": system_prompt, "history": formatted_history, "human_input": message})
        return result
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."
# Gradio UI
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.State([]),  # conversation history; starts empty (not updated automatically between turns)
        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs="text",
    title="Qwen2.5-Coder-7B-Instruct with LangChain",
    live=True,
).launch()