import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import spaces
MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load model and tokenizer
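# HF_TOKEN is read from the environment (set as a Space secret, for example) in case the model repo requires authentication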
HF_TOKEN = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN).to(device)
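
# On ZeroGPU Spaces, the spaces.GPU decorator attaches a GPU to the function for the duration of each call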
@spaces.GPU
def predict(message, history):
    messages = []

    # Add history to messages
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current message
    messages.append({"role": "user", "content": message})

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize inputs
    model_inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to(model.device)

    # Generate response
    generated_ids = model.generate(**model_inputs, max_new_tokens=1000)
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(output_ids, skip_special_tokens=True)

    return response
# Create ChatInterface
chatbot = gr.ChatInterface(
    predict,
    title="Apertus-8B Chatbot",
    description="Chat with the Apertus-8B-Instruct model. Enter your message and get a response.",
    examples=[
        "Explain quantum computing in simple terms",
        "How do I make a sandwich?",
        "What is the capital of France?",
        "Write a short poem about the ocean",
    ],
)
# Launch the app
chatbot.launch()