| """ | |
| Try out gradio.Chatinterface. | |
| colab gradio-chatinterface. | |
| %%writefile reuirements.txt | |
| gradio | |
| transformers | |
| sentencepiece | |
| torch | |
| """ | |
# pylint: disable=line-too-long, missing-module-docstring, missing-function-docstring
# import torch
from time import time

import gradio as gr
from about_time import about_time
from examples_list import examples_list
from transformers import AutoModel, AutoTokenizer  # AutoModelForCausalLM,
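# examples_list is a local module not shown here; the lines below are only an
# assumption about its shape -- gr.ChatInterface expects a list of example
# prompts (strings), e.g.:
# examples_list = [
#     "Explain the difference between a process and a thread.",
#     "Write a haiku about autumn rain.",
# ]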
# device = "cuda" if torch.cuda.is_available() else "cpu"
# tokenizer = AutoTokenizer.from_pretrained("stabilityai/StableBeluga2", use_fast=False)
# model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga2", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
# system_prompt = "### System:\nYou are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"
# pipeline = pipeline(task="text-generation", model="meta-llama/Llama-2-7b")
tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm2-6b-int4", trust_remote_code=True
)
chat_model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b-int4", trust_remote_code=True  # ~3.92 GB download
).float()  # cast to float32 so the quantized model runs on CPU
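# Sketch of an alternative load path, assuming a CUDA GPU with enough VRAM is
# available (not used on this CPU-only Space):
# chat_model = AutoModel.from_pretrained(
#     "THUDM/chatglm2-6b-int4", trust_remote_code=True
# ).half().cuda()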


def chat(message, history):
    # prompt = f"{system_prompt}### User: {message}\n\n### Assistant:\n"
    # inputs = tokenizer(prompt, return_tensors="pt").to(device=device)
    # output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)
    # return tokenizer.decode(output[0], skip_special_tokens=True)
    flag = 1  # truthy until the first chunk arrives
    then = time()
    prefix = ""
    prelude = 0.0  # time to first chunk
    with about_time() as dur:
        for response, _ in chat_model.stream_chat(
            tokenizer, message, history, max_length=2048, top_p=0.7, temperature=0.95
        ):
            if flag:
                flag = 0
                prelude = time() - then
                prefix = f"{prelude:.2f}s"  # show the first-chunk latency
            yield f"{prefix} {response}"
    # average seconds per generated character, excluding the initial latency
    suffix = f"\n(time elapsed: {dur.duration_human}, {(time() - then - prelude)/len(response):.2f}s/char)"
    yield f"{response}{suffix}"
chatbot = gr.Chatbot([], label="Bot", height=450)
textbox = gr.Textbox("", scale=10, label="", lines=2, placeholder="Ask me anything")
submit_btn = gr.Button(value="▶️ Send", scale=1, min_width=0, variant="primary")
interf = gr.ChatInterface(
    chat,
    chatbot=chatbot,
    textbox=textbox,
    submit_btn=submit_btn,
    title="ChatGLM2-6B-int4 (CPU)",
    examples=examples_list,
    theme=gr.themes.Glass(text_size="sm", spacing_size="sm"),
).queue(max_size=5)
if __name__ == "__main__":
    interf.launch(debug=True)
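    # For local testing, a temporary public URL can be requested instead
    # (share=True is a standard Gradio launch option, not needed on a hosted Space):
    # interf.launch(debug=True, share=True)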