import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer (HF_TOKEN is required for gated/private checkpoints)
HF_TOKEN = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,  # bf16 halves memory use vs. the fp32 default for an 8B model
).to(device)


@spaces.GPU
def predict(message, history):
    messages = []

    # Add history to messages (ChatInterface passes (user, assistant) pairs)
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current message
    messages.append({"role": "user", "content": message})

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize inputs (special tokens already come from the chat template)
    model_inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to(model.device)

    # Generate a response and strip the prompt tokens from the output
    generated_ids = model.generate(**model_inputs, max_new_tokens=1000)
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    return response


# Create ChatInterface
chatbot = gr.ChatInterface(
    predict,
    title="Apertus-8B Chatbot",
    description="Chat with the Apertus-8B-Instruct model. Enter your message and get a response.",
    examples=[
        "Explain quantum computing in simple terms",
        "How do I make a sandwich?",
        "What is the capital of France?",
        "Write a short poem about the ocean",
    ],
)

# Launch the app
chatbot.launch()
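
# --- Optional: streaming responses (hedged sketch, not part of the original app) ---
# gr.ChatInterface also accepts generator functions, so the reply can be
# streamed token by token with transformers' TextIteratorStreamer running
# generate() on a background thread. This is an illustrative, untested sketch;
# to use it, define it above and pass `predict_stream` to gr.ChatInterface
# instead of `predict`. Generation settings mirror the non-streaming version.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# @spaces.GPU
# def predict_stream(message, history):
#     messages = []
#     for user_msg, assistant_msg in history:
#         messages.append({"role": "user", "content": user_msg})
#         messages.append({"role": "assistant", "content": assistant_msg})
#     messages.append({"role": "user", "content": message})
#     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     model_inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     thread = Thread(target=model.generate, kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=1000))
#     thread.start()
#     partial = ""
#     for chunk in streamer:
#         partial += chunk
#         yield partial  # ChatInterface re-renders each yielded partial response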