from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import gradio as gr

model_id = "sunhaonlp/Qwen2.5_7B_Instruct_ZeroSearch_wiki_V2"

# 4-bit NF4 quantization with nested (double) quantization to cut memory use;
# float32 compute keeps numerics simple at the cost of speed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()


def build_prompt(history, query):
    """Rebuild the full chat-template prompt from prior turns plus the new query."""
    messages = [{"role": "system", "content": "You are a smart, humorous, and broadly knowledgeable Chinese-language AI assistant."}]
    for user, bot in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": bot})
    messages.append({"role": "user", "content": query})
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


def chat(query, history=None):
    # Use None instead of a mutable default ([]): a shared default list would
    # leak conversation state across calls (and across users in a Gradio app).
    history = [] if history is None else history
    prompt = build_prompt(history, query)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,  # required for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    result = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
    history.append((query, result))
    return result, history


with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Qwen2.5 Chinese Assistant (free CPU edition)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask me anything, e.g.: how do I open a US brokerage account?")
    state = gr.State([])
    send = gr.Button("Send")

    def user_input(user_msg, history):
        # chat() already appends (user_msg, reply) to history, so return the
        # updated history as-is; appending again would duplicate the last turn.
        _, history = chat(user_msg, history)
        return history, history, ""  # third output clears the textbox

    send.click(user_input, [msg, state], [chatbot, state, msg])
    msg.submit(user_input, [msg, state], [chatbot, state, msg])

demo.launch()
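
# ---------------------------------------------------------------------------
# Note on the "CPU edition": bitsandbytes 4-bit quantization generally
# expects a CUDA GPU. For a genuinely CPU-only run, a minimal sketch (an
# assumption, not part of the original script) is to drop BitsAndBytesConfig
# and load plain float32 weights instead:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id,
#       torch_dtype=torch.float32,
#       device_map="cpu",
#       trust_remote_code=True,
#   )
#
# A 7B model in float32 needs roughly 28 GB of RAM and generates slowly on
# CPU, so a smaller checkpoint may be the more practical choice.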
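
# ---------------------------------------------------------------------------
# Optional: for a more responsive UI, generation can be streamed token by
# token instead of returned in one block. A hedged sketch using transformers'
# TextIteratorStreamer (not part of the original script; the generation
# kwargs are illustrative):
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   def chat_stream(query, history):
#       prompt = build_prompt(history, query)
#       inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#       streamer = TextIteratorStreamer(
#           tokenizer, skip_prompt=True, skip_special_tokens=True
#       )
#       Thread(
#           target=model.generate,
#           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512),
#       ).start()
#       partial = ""
#       for piece in streamer:
#           partial += piece
#           yield partial  # Gradio renders each partial reply as it arrives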