| """ | |
| Try out gradio.Chatinterface. | |
| colab gradio-chatinterface. | |
| %%writefile reuirements.txt | |
| gradio | |
| transformers | |
| sentencepiece | |
| torch | |
| """ | |
# pylint: disable=line-too-long, missing-module-docstring, missing-function-docstring
# import torch
from time import time

import gradio as gr
from about_time import about_time
from examples_list import examples_list
from transformers import AutoModel, AutoTokenizer  # AutoModelForCausalLM,
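# examples_list is a local module not shown here; the lines below are only an
# assumption about its shape -- gr.ChatInterface expects a list of example
# prompts (strings), e.g.:
# examples_list = [
#     "Explain the difference between a process and a thread.",
#     "Write a haiku about autumn rain.",
# ]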
# device = "cuda" if torch.cuda.is_available() else "cpu"
# tokenizer = AutoTokenizer.from_pretrained("stabilityai/StableBeluga2", use_fast=False)
# model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga2", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
# system_prompt = "### System:\nYou are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"
# pipeline = pipeline(task="text-generation", model="meta-llama/Llama-2-7b")
tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm2-6b-int4", trust_remote_code=True
)
chat_model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b-int4", trust_remote_code=True  # ~3.92 GB download
).float()  # cast to float32 so the quantized model runs on CPU
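# Sketch of an alternative load path, assuming a CUDA GPU with enough VRAM is
# available (not used on this CPU-only Space):
# chat_model = AutoModel.from_pretrained(
#     "THUDM/chatglm2-6b-int4", trust_remote_code=True
# ).half().cuda()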


def chat(message, history):
    # prompt = f"{system_prompt}### User: {message}\n\n### Assistant:\n"
    # inputs = tokenizer(prompt, return_tensors="pt").to(device=device)
    # output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)
    # return tokenizer.decode(output[0], skip_special_tokens=True)
    flag = 1  # truthy until the first chunk arrives
    then = time()
    prefix = ""
    prelude = 0.0  # time to first chunk
    with about_time() as dur:
        for response, _ in chat_model.stream_chat(
            tokenizer, message, history, max_length=2048, top_p=0.7, temperature=0.95
        ):
            if flag:
                flag = 0
                prelude = time() - then
                prefix = f"{prelude:.2f}s"  # show the first-chunk latency
            yield f"{prefix} {response}"
    # average seconds per generated character, excluding the initial latency
    suffix = f"\n(time elapsed: {dur.duration_human}, {(time() - then - prelude)/len(response):.2f}s/char)"
    yield f"{response}{suffix}"
chatbot = gr.Chatbot([], label="Bot", height=450)
textbox = gr.Textbox("", scale=10, label="", lines=2, placeholder="Ask me anything")
submit_btn = gr.Button(value="▶️ Send", scale=1, min_width=0, variant="primary")
interf = gr.ChatInterface(
    chat,
    chatbot=chatbot,
    textbox=textbox,
    submit_btn=submit_btn,
    title="ChatGLM2-6B-int4 (CPU)",
    examples=examples_list,
    theme=gr.themes.Glass(text_size="sm", spacing_size="sm"),
).queue(max_size=5)
if __name__ == "__main__":
    interf.launch(debug=True)
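    # For local testing, a temporary public URL can be requested instead
    # (share=True is a standard Gradio launch option, not needed on a hosted Space):
    # interf.launch(debug=True, share=True)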