# Spaces: Running on Zero
# File size: 3,141 Bytes
import json
import os

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoTokenizer
# ---------------------------------------------------------------------------
# Model / generation configuration
# ---------------------------------------------------------------------------

# Passed to the model as ``max_tokens`` for every reply.
MAX_NEW_TOKENS = 8192

# MODEL_NAME is the repo the tokenizer is loaded from; the GGUF weights are
# fetched from the repo of the same name with a "-GGUF" suffix.
MODEL_NAME = "Azure99/Blossom-V6.2-36B"
MODEL_GGUF_REPO = MODEL_NAME + "-GGUF"
MODEL_FILE = "blossom-v6.2-36b-q8_0.gguf"
MODEL_LOCAL_DIR = "./"

# Runs at import time: fetch the weights file into MODEL_LOCAL_DIR.
hf_hub_download(
    repo_id=MODEL_GGUF_REPO,
    filename=MODEL_FILE,
    local_dir=MODEL_LOCAL_DIR,
)

# Starts out unset; chat() creates the Llama instance on its first call.
llm: Llama = None

# Tokenizer used by chat() to render the conversation with the chat template.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def get_messages(user, history):
    """Build the message list to send to the model for one request.

    ``user`` is normally the plain text the user submitted. It may instead
    be a JSON object of the form ``{"by_json_str": true, "messages": [...]}``;
    in that case the embedded ``messages`` list is returned unchanged and
    ``history`` is ignored. Any input that does not match that envelope
    exactly is treated as plain text.

    Args:
        user: The submitted text, or the JSON envelope described above.
        history: Earlier messages of the conversation (presumably
            ``{"role": ..., "content": ...}`` dicts, matching the one
            appended here), or ``None``/empty when there are none.

    Returns:
        Either the ``messages`` list taken from the JSON envelope, or a new
        list holding ``history`` followed by ``{"role": "user", "content":
        user}``. ``history`` itself is never mutated.
    """
    try:
        parsed_body = json.loads(user)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (ordinary chat text),
        # TypeError covers non-string input, RecursionError covers absurdly
        # nested JSON. All of them simply mean "not an envelope".
        parsed_body = None

    # Valid JSON that is not an object (e.g. the user typed "123") has no
    # .get(); check the type instead of relying on an exception.
    if isinstance(parsed_body, dict) and parsed_body.get("by_json_str"):
        override = parsed_body.get("messages")
        if isinstance(override, list):
            return override

    messages = list(history or [])
    messages.append({"role": "user", "content": user})
    return messages
@spaces.GPU(duration=120)
def chat(user, history, temperature, top_p, repetition_penalty):
    """Stream a model reply for the current conversation.

    The Llama instance is created on the first call and kept in the
    module-level ``llm`` for later calls.

    Args:
        user: Text submitted by the user (see ``get_messages`` for the
            JSON-envelope form it also accepts).
        history: Earlier messages of the conversation.
        temperature: Forwarded to the model as ``temperature``.
        top_p: Forwarded to the model as ``top_p``.
        repetition_penalty: Forwarded to the model as ``repeat_penalty``.

    Yields:
        The reply text accumulated so far, once per streamed chunk, so each
        yielded value is the full partial reply rather than a delta.
    """
    global llm
    if llm is None:
        llm = Llama(
            # Load from the directory the weights were downloaded into,
            # rather than assuming it is the current working directory.
            model_path=os.path.join(MODEL_LOCAL_DIR, MODEL_FILE),
            n_gpu_layers=-1,
            flash_attn=True,
            n_ctx=16384,
        )

    messages = get_messages(user, history)
    print(f"Messages: {messages}")

    # add_generation_prompt=True makes the template end with the tokens that
    # open the assistant turn; without it the prompt stops after the last
    # message and the model is not cued to answer as the assistant.
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    generate_config = dict(
        temperature=temperature,
        top_p=top_p,
        repeat_penalty=repetition_penalty,
        top_k=0,  # NOTE(review): presumably disables top-k filtering - confirm
        stream=True,
        max_tokens=MAX_NEW_TOKENS,
    )

    outputs = ""
    for chunk in llm(input_ids, **generate_config):
        outputs += chunk["choices"][0]["text"]
        yield outputs
# Sampling controls handed to gr.ChatInterface as additional inputs.
# Keep the list order in sync with chat()'s extra parameters
# (temperature, top_p, repetition_penalty).
_temperature_slider = gr.Slider(
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    value=0.5,
    label="Temperature",
    info="Controls randomness in choosing words.",
    interactive=True,
)

_top_p_slider = gr.Slider(
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    value=0.85,
    label="Top-P",
    info="Picks words until their combined probability is at least top_p.",
    interactive=True,
)

_repetition_penalty_slider = gr.Slider(
    minimum=1.0,
    maximum=1.2,
    step=0.01,
    value=1.05,
    label="Repetition penalty",
    info="Repetition Penalty: Controls how much repetition is penalized.",
    interactive=True,
)

additional_inputs = [
    _temperature_slider,
    _top_p_slider,
    _repetition_penalty_slider,
]
# Build the chat UI around chat() and start serving it. This runs at import
# time, as in the original script.
demo = gr.ChatInterface(
    chat,
    type="messages",
    chatbot=gr.Chatbot(
        show_label=False,
        height=500,
        show_copy_button=True,
        render_markdown=True,
        type="messages",
        # Only \[ ... \] is registered as a (display-mode) LaTeX delimiter.
        latex_delimiters=[{"left": "\\[", "right": "\\]", "display": True}],
    ),
    textbox=gr.Textbox(placeholder="", container=False, scale=7),
    title=f"{MODEL_NAME} Demo",
    # Two adjacent literals: greeting text followed directly by the link.
    description=(
        "Hello, I am Blossom, an open source conversational large language model.🌠"
        '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>'
    ),
    theme="soft",
    examples=[
        ["Hello"],
        ["What is MBTI"],
        ["用Python实现二分查找"],
        ["为switch写一篇小红书种草文案,带上emoji"],
    ],
    cache_examples=False,
    additional_inputs=additional_inputs,
    additional_inputs_accordion=gr.Accordion(label="Config", open=True),
)

demo.queue().launch()