import os
import logging
from logging.handlers import RotatingFileHandler
from threading import Thread

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.debug("Application started")

# Define model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
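# A minimal sketch of a stricter 4-bit setup (assumption: NF4 plus double
# quantization, both standard BitsAndBytesConfig options, are appropriate for
# this Space); kept as a comment so the current behavior is unchanged:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )
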
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", quantization_config=quantization_config)

# Create Hugging Face pipeline
# do_sample=True is required for temperature/top_k/top_p to take effect, and
# return_full_text=False keeps the prompt out of the generated reply.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
    return_full_text=False,
)

# Initialize HuggingFacePipeline model for LangChain
chat_model = HuggingFacePipeline(pipeline=pipe)
logger.debug("Model and tokenizer loaded successfully")
# Define the conversation template for LangChain
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""

# Create LangChain prompt and chain
prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
chain = LLMChain(llm=chat_model, prompt=prompt)
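# A quick rendering check (an optional debugging aid, assuming placeholder values
# are acceptable here): log the filled-in template once so the ChatML markers can
# be inspected in /tmp/app_debug.log.
logger.debug(
    "Example rendered prompt:\n%s",
    prompt.format(system_prompt="You are a helpful coding assistant", history="", human_input="ping"),
)
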
# Format the conversation history as ChatML turns
def format_history(history):
    formatted = ""
    for human, ai in history or []:  # tolerate an empty/None history on the first turn
        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
    return formatted
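# For example, format_history([("hi", "hello!")]) yields:
# "<|im_start|>user\nhi\n<|im_end|>\n<|im_start|>assistant\nhello!\n<|im_end|>\n"
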
# Prediction function using LangChain and the model
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Note: the slider values are received here, but the generation parameters were
    # fixed when the pipeline was created above; see the sketch after this function
    # for one way to apply them per request.
    history = history or []
    formatted_history = format_history(history)
    try:
        result = chain.run({"system_prompt": system_prompt, "history": formatted_history, "human_input": message})
    except Exception as e:
        logger.exception("Error during prediction: %s", e)
        result = "An error occurred."
    return result, history + [(message, result)]
# Gradio UI
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.State([]),  # conversation history as a list of (user, assistant) pairs
        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs=[gr.Textbox(label="Assistant"), gr.State()],  # return the reply and the updated history
    title="Qwen2.5-Coder-7B-Instruct with LangChain",
).launch()