ykvns committed
Commit ab0b22d · verified · 1 Parent(s): 80b70e3

Update app.py

Files changed (1)
  1. app.py +78 -67
app.py CHANGED
@@ -1,97 +1,110 @@
 import gradio as gr
-from llama_cpp import Llama
-from transformers import AutoTokenizer
-from huggingface_hub import hf_hub_download
-import os
+import torch
+from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

-# Model paths
+# Model and Tokenizer Configuration
 MODEL_REPO_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"
-GGUF_REPO_ID = "lmstudio-community/LGAI-EXAONE-4.0-1.2B-GGUF"
-GGUF_FILENAME = "A.X-4.0-Light-imatrix-IQ1_S.gguf"
-TOKENIZER_DIR = "exaone-tokenizer"
-TEMPLATE_PATH = os.path.join(TOKENIZER_DIR, "chat_template.jinja")

-# Download GGUF model
-print("πŸ”„ Downloading GGUF model...")
-model_path = hf_hub_download(repo_id=GGUF_REPO_ID, filename=GGUF_FILENAME)
-print(f"βœ… Model downloaded to: {model_path}")
-
-# Load tokenizer
-print("πŸ”„ Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
-print("βœ… Tokenizer loaded.")
+print("βœ… Starting application...")

+# Load the model with bfloat16 to save memory
 try:
-    with open(TEMPLATE_PATH, "r", encoding="utf-8") as f:
-        tokenizer.chat_template = f.read()
-    print("βœ… Chat template loaded.")
+    print(f"πŸ”„ Loading tokenizer from '{MODEL_REPO_ID}'...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID)
+    print("βœ… Tokenizer loaded successfully.")
+
+    print(f"πŸ”„ Loading model '{MODEL_REPO_ID}' with torch_dtype=torch.bfloat16...")
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_REPO_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    print("βœ… Model loaded successfully.")
+
 except Exception as e:
-    print(f"Could not load chat_template.jinja: {e}")
-    tokenizer.chat_template = None
-
-# Load model
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,
-    n_threads=os.cpu_count(),
-    n_gpu_layers=-1,
-    use_mlock=True,
-    verbose=False
-)
-
-# Streaming chat function
-def format_prompt(messages):
-    formatted = ""
-    for m in messages:
-        role = m["role"].upper()
-        formatted += f"{role}: {m['content']}\n"
-    return formatted + "ASSISTANT:"
+    print(f"❌ Error loading model or tokenizer: {e}")
+    # Exit if model fails to load, as the app is unusable.
+    raise

+# Streaming Chat Function
 def user_input_handler(user_message, history):
+    """Handles user input by appending it to the history."""
     return "", history + [[user_message, None]]

 def bot_stream(history):
+    """
+    Generates the bot's response using a streaming approach.
+    This function runs the model in a separate thread to avoid blocking the UI.
+    """
+    print(f"πŸ“ History received: {history}")
+    # The last message is the user's prompt.
     user_message = history[-1][0]
-    history[-1][1] = ""
+    history[-1][1] = ""  # Initialize the bot's response field.

-    # Convert chat history to OpenAI format
-    messages = [{"role": "system", "content": "You are a helpful assistant."}]
-    for human, assistant in history[:-1]:
+    # Format the conversation history into the model's expected chat format.
+    messages = []
+    for human, assistant in history[:-1]:  # All but the last interaction
         messages.append({"role": "user", "content": human})
-        if assistant:
+        if assistant:  # Assistant message might be None
             messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": user_message})

     try:
+        # Apply the chat template to format the prompt.
         prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    except Exception:
-        prompt = format_prompt(messages)
+    except Exception as e:
+        print(f"⚠️ Warning: Could not apply chat template. Using basic formatting. Error: {e}")
+        # Fallback for models without a registered chat template
+        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages]) + "\nassistant:"
+
+    print("➑️ Generated Prompt for Model:\n" + prompt)
+
+    # Tokenize the formatted prompt.
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

-    # Generate streaming output
-    generator = llm.create_completion(
-        prompt=prompt,
-        max_tokens=512,
+    # Use TextIteratorStreamer for non-blocking, token-by-token generation.
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # Set up the generation parameters in a dictionary.
+    generation_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        max_new_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        stream=True,
-        stop=["</s>", "<|endoftext|>", "USER:", "ASSISTANT:"]
+        do_sample=True,
+        eos_token_id=tokenizer.eos_token_id,
     )

-    for chunk in generator:
-        token = chunk["choices"][0]["text"]
+    # Run the generation in a separate thread to avoid blocking the Gradio UI.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Yield each new token to the Gradio chat interface as it's generated.
+    for token in streamer:
         history[-1][1] += token
         yield history

-# Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## πŸ€– EXAONE-4.0-1.2B Streaming")
-    chatbot = gr.Chatbot(label="Chat History", height=600)
+# Gradio User Interface
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css="footer {display: none !important}") as demo:
+    gr.Markdown("## πŸ€– EXAONE-4.0-1.2B")
+    gr.Markdown("This demo runs the standard `LGAI-EXAONE/EXAONE-4.0-1.2B` model using the `transformers` library.")
+
+    chatbot = gr.Chatbot(label="Chat History", height=600, bubble_full_width=False)
     with gr.Row():
-        msg = gr.Textbox(placeholder="Type a message...", label="Your Message", scale=8)
-        send_btn = gr.Button("Send", scale=1)
-        clear_btn = gr.Button("Clear Chat", scale=1, variant="secondary")
+        msg = gr.Textbox(
+            placeholder="Type your message here...",
+            label="Your Message",
+            scale=8,
+            autofocus=True,
+        )
+        send_btn = gr.Button("Send", scale=1, variant="primary")

-    # Chat events
+    clear_btn = gr.ClearButton([msg, chatbot], value="πŸ—‘οΈ Clear Chat")
+
+    # Event Handlers
     msg.submit(user_input_handler, [msg, chatbot], [msg, chatbot], queue=False).then(
         bot_stream, chatbot, chatbot
     )
@@ -99,7 +112,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         bot_stream, chatbot, chatbot
     )

-    clear_btn.click(lambda: [], None, chatbot)
-
     demo.queue()
-    demo.launch()
+    demo.launch(debug=True)
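
For quick reference, below is a minimal standalone sketch of the threaded streaming pattern the updated app.py relies on: model.generate runs in a worker thread while a TextIteratorStreamer yields decoded tokens to the caller. The repo id, bfloat16 dtype, and sampling settings are taken from the diff above; the example prompt and max_new_tokens value are illustrative only, and hardware with enough memory for the model is assumed.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

REPO_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"

tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
model = AutoModelForCausalLM.from_pretrained(
    REPO_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a chat-formatted prompt, then tokenize it onto the model's device.
messages = [{"role": "user", "content": "Hello!"}]  # illustrative prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The streamer skips the echoed prompt and special tokens, yielding only new text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# model.generate blocks until generation finishes, so it runs in a worker thread
# while the main thread consumes tokens from the streamer as they arrive.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        inputs,
        streamer=streamer,
        max_new_tokens=128,  # illustrative length
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    ),
)
thread.start()

reply = ""
for token in streamer:
    reply += token
    print(token, end="", flush=True)
thread.join()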