import os
import logging
from logging.handlers import RotatingFileHandler
from threading import Thread

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.debug("Application started")

# Define model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
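# A minimal sketch of a stricter 4-bit setup (assumption: NF4 plus double
# quantization, both standard BitsAndBytesConfig options, are appropriate for
# this Space); kept as a comment so the current behavior is unchanged:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )
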
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", quantization_config=quantization_config)

# Create Hugging Face pipeline
# do_sample=True is required for temperature/top_k/top_p to take effect, and
# return_full_text=False keeps the prompt out of the generated reply.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
    return_full_text=False,
)

# Initialize HuggingFacePipeline model for LangChain
chat_model = HuggingFacePipeline(pipeline=pipe)
logger.debug("Model and tokenizer loaded successfully")
# Define the conversation template for LangChain
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""

# Create LangChain prompt and chain
prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
chain = LLMChain(llm=chat_model, prompt=prompt)
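# A quick rendering check (an optional debugging aid, assuming placeholder values
# are acceptable here): log the filled-in template once so the ChatML markers can
# be inspected in /tmp/app_debug.log.
logger.debug(
    "Example rendered prompt:\n%s",
    prompt.format(system_prompt="You are a helpful coding assistant", history="", human_input="ping"),
)
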
# Format the conversation history as ChatML turns
def format_history(history):
    formatted = ""
    for human, ai in history or []:  # tolerate an empty/None history on the first turn
        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
    return formatted
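# For example, format_history([("hi", "hello!")]) yields:
# "<|im_start|>user\nhi\n<|im_end|>\n<|im_start|>assistant\nhello!\n<|im_end|>\n"
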
# Prediction function using LangChain and the model
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Note: the slider values are received here, but the generation parameters were
    # fixed when the pipeline was created above; see the sketch after this function
    # for one way to apply them per request.
    history = history or []
    formatted_history = format_history(history)
    try:
        result = chain.run({"system_prompt": system_prompt, "history": formatted_history, "human_input": message})
    except Exception as e:
        logger.exception("Error during prediction: %s", e)
        result = "An error occurred."
    return result, history + [(message, result)]
# Gradio UI
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.State([]),  # conversation history as a list of (user, assistant) pairs
        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs=[gr.Textbox(label="Assistant"), gr.State()],  # return the reply and the updated history
    title="Qwen2.5-Coder-7B-Instruct with LangChain",
).launch()