Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import os | |
| import spaces | |
# Hugging Face Hub identifier of the checkpoint served by this app.
MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Hub access token taken from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the tokenizer and the model weights, then move the model onto the
# selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = model.to(device)
def _history_to_messages(history):
    """Convert chat history into a list of role/content message dicts.

    Two history shapes are accepted:

    * a sequence of ``(user_msg, assistant_msg)`` pairs, and
    * a sequence of dicts carrying ``"role"`` and ``"content"`` keys.

    Entries (or halves of a pair) whose content is ``None`` are skipped so
    that no empty turn is handed to the chat template.

    Args:
        history: Previous turns of the conversation, or ``None``.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts in the
        original conversation order.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Dict-style history already has the role/content layout.
            role = turn.get("role")
            content = turn.get("content")
            if role is not None and content is not None:
                messages.append({"role": role, "content": content})
            continue

        # Pair-style history: one user message and its assistant reply.
        user_msg, assistant_msg = turn
        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg is not None:
            messages.append({"role": "assistant", "content": assistant_msg})
    return messages


# Requests a GPU for each call when the app runs on ZeroGPU hardware; the
# `spaces` documentation describes the decorator as effect-free elsewhere.
@spaces.GPU
def predict(message, history):
    """Generate the assistant's reply to ``message`` given the chat history.

    Args:
        message: The user's newest message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            role/content dicts.

    Returns:
        The decoded text of the newly generated tokens, with special tokens
        removed.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # Render the conversation to a prompt string ending with the
    # generation prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # add_special_tokens=False is kept from the original call;
    # presumably the rendered template already contains them.
    model_inputs = tokenizer(
        [text],
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(output_ids, skip_special_tokens=True)
# Build the chat UI around `predict`, with a few clickable starter prompts.
chatbot = gr.ChatInterface(
    fn=predict,
    examples=[
        "Explain quantum computing in simple terms",
        "How do I make a sandwich?",
        "What is the capital of France?",
        "Write a short poem about the ocean",
    ],
    description="Chat with the Apertus-8B-Instruct model. Enter your message and get a response.",
    title="Apertus-8B Chatbot",
)

# Start serving the interface.
chatbot.launch()